#!/bin/bash
#
# Crawl a website with wget and write the sorted, de-duplicated list of URL
# paths it visited to "<host>.urls" in the current directory.
#
# Usage: <this-script> URL      e.g. http://www.example.com/
#
# Every URL wget reports is reduced to a path (scheme and host removed, a
# leading /en/ or /el/ prefix dropped). URLs containing /sites/ are ignored;
# URLs ending in a known file suffix, or in /node/<id> or /content/<id>, are
# replaced by their parent directory.
#
# Exit status: 2 on bad usage, 127 when wget is not installed, otherwise the
# status of the last command that ran.

#######################################
# Print the output file name derived from a start URL.
# Arguments: $1 - start URL, with or without a scheme
# Outputs:   "<host>.urls" on stdout, where <host> has any leading "www."
#            removed
#######################################
output_name() {
  local name=$1
  name=${name#*//}    # drop everything up to and including the first "//"
  name=${name%%/*}    # keep the host only, so the name never contains "/"
  name=${name#www.}   # www.example.com and example.com share one file
  printf '%s.urls\n' "${name}"
}

#######################################
# Reduce one crawled URL to the path that should be recorded for it.
# Arguments: $1 - URL as reported by wget
# Outputs:   the normalised path on stdout, or nothing when the URL is
#            ignored (it contains /sites/)
# Returns:   0
#######################################
url_to_path() {
  local url=$1
  local path
  local collapse=false
  local url_re='^[^/]*//[^/]*(/.*)?$'
  local numbered_re='/(node|content)/[0-9]{1,6}$'

  # ignore url sites/*
  case ${url} in
    */sites/*) return 0 ;;
  esac

  # Strip scheme and host first, so a file directly under the site root
  # collapses to "/" instead of leaving the bare "scheme://host" behind.
  if [[ ${url} =~ ${url_re} ]]; then
    path=${BASH_REMATCH[1]:-/}
  else
    path=${url}   # not an absolute URL: leave it untouched
  fi

  # unify paths for files, node/*, content/*
  case ${url} in
    *pdf | *doc | *png | *css | *svg | *xml | *gif | *jpg)
      collapse=true
      ;;
    *)
      if [[ ${url} =~ ${numbered_re} ]]; then
        collapse=true
      fi
      ;;
  esac
  if [[ ${collapse} == true ]]; then
    path=$(dirname -- "${path}")
  fi

  # Drop a leading language prefix, /en/ first and then /el/.
  if [[ ${path} == /en/* ]]; then
    path=/${path#/en/}
  fi
  if [[ ${path} == /el/* ]]; then
    path=/${path#/el/}
  fi

  printf '%s\n' "${path}"
}

#######################################
# Entry point: crawl the start URL and write the path list.
# Arguments: $1 - start URL
# Outputs:   progress summary on stdout, diagnostics on stderr, and the
#            result file named by output_name in the current directory
# Returns:   2 on bad usage, 127 if wget is missing
#######################################
main() {
  if (( $# < 1 )) || [[ -z $1 ]]; then
    printf 'Usage: %s URL\n' "${0##*/}" >&2
    return 2
  fi

  local start_url=$1
  local out_file
  out_file=$(output_name "${start_url}")
  if [[ ${out_file} == .urls ]]; then
    printf 'error: cannot derive a host name from "%s"\n' "${start_url}" >&2
    return 2
  fi

  # wget's stderr is fed into the filter below, so a missing binary would
  # otherwise go unnoticed and just produce an empty list.
  if ! command -v wget > /dev/null; then
    printf 'error: wget is required but was not found in PATH\n' >&2
    return 127
  fi

  echo
  echo " URLs from : ${start_url}"
  echo " To file : ${out_file}"

  # wget logs each request as "--<date> <time>--  <url>"; field 3 is the URL.
  # The whole job is one pipeline, so no intermediate files are left behind
  # and nothing from a previous run can leak into the result. wget's exit
  # status is not checked here.
  local url
  wget --mirror --delete-after --no-directories -- "${start_url}" 2>&1 \
    | awk '/^--/ { print $3 }' \
    | while IFS= read -r url; do
        [[ -n ${url} ]] || continue
        url_to_path "${url}"
      done \
    | sort -u > "${out_file}"

  echo " URLs found : $(wc -l < "${out_file}")"
  echo
}

main "$@"