get-links/getlinks.sh

#!/bin/bash
DOMAIN=$1
[[ -z "$DOMAIN" ]] && { echo "Usage: $0 <url>" >&2; exit 1; }
# output file is named after the host part of the URL (scheme, "www." and trailing slashes stripped)
FILE=$(echo "$DOMAIN" | sed 's/\/*$//g' | sed 's/www\.//' | awk -F// '{print $2}').urls
echo
echo " URLs from : $DOMAIN"
echo " To file : $FILE"
# mirror the site without keeping the downloaded files and collect every URL wget reports
wget --mirror --delete-after --no-directories "$DOMAIN" 2>&1 | grep '^--' | awk '{print $3}' > ".$FILE"
while read -r line; do
    # skip URLs under sites/*
    if [[ $line == */sites/* ]]; then
        continue
    # for static files and node/*, content/* pages keep only the parent path
    elif [[
        $line == *pdf ||
        $line == *doc ||
        $line == *png ||
        $line == *css ||
        $line == *svg ||
        $line == *xml ||
        $line == *gif ||
        $line == *jpg ||
        $line =~ /node/[0-9]{1,6}$ ||
        $line =~ /content/[0-9]{1,6}$
    ]]; then
        dirname "$line" | sed 's/^.*\/\/[^\/]*\//\//' | sed 's/^\/en\//\//' | sed 's/^\/el\//\//' >> ".$FILE.tmp"
    else
        # keep the full path, stripped of scheme/host and the en/el language prefix
        echo "$line" | sed 's/^.*\/\/[^\/]*\//\//' | sed 's/^\/en\//\//' | sed 's/^\/el\//\//' >> ".$FILE.tmp"
    fi
done < ".$FILE"
sort -u ".$FILE.tmp" > "$FILE"
# clean up the intermediate files
rm -f ".$FILE" ".$FILE.tmp"
echo " URLs found : $(wc -l < "$FILE")"
echo
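
Example invocation (a minimal sketch; example.com is a hypothetical domain, not one taken from the script):

./getlinks.sh https://www.example.com
# writes example.com.urls with one site-relative path per line,
# e.g. https://www.example.com/en/node/123 would be recorded as /node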