#!/bin/bash
# Crawl a site with wget and extract a de-duplicated list of its URL paths.

DOMAIN=$1
if [ -z "$DOMAIN" ]; then
    echo "Usage: $0 <domain>" >&2
    exit 1
fi

# Derive the output file name from the domain, e.g.
# https://www.example.com/ -> example.com.urls
FILE=$(echo "$DOMAIN" | sed 's/\/*$//g' | sed 's/www\.//' | awk -F// '{print $2}').urls

echo
echo " URLs from : $DOMAIN"
echo " To file   : $FILE"

# Spider the site without keeping files; wget logs each request as
# "--<timestamp>--  <url>", so grep/awk pull the URL out of those lines.
wget --mirror --delete-after --no-directories "$DOMAIN" 2>&1 \
    | grep '^--' | awk '{print $3}' > ".$FILE"

# Start from a clean temp file: the loop below appends to it, so a
# leftover file from a previous run would pollute the results.
rm -f ".$FILE.tmp"

while read -r line; do
    # ignore URLs under sites/*
    if [[ $line == */sites/* ]]; then
        continue
    # unify paths for files and for node/*, content/* pages:
    # keep only the containing directory
    elif [[ $line == *.pdf || $line == *.doc || $line == *.png || $line == *.css ||
            $line == *.svg || $line == *.xml || $line == *.gif || $line == *.jpg ||
            $line =~ /node/[0-9]{1,6}$ || $line =~ /content/[0-9]{1,6}$ ]]; then
        dirname "$line" \
            | sed -e 's/^.*\/\/[^\/]*\//\//' -e 's/^\/en\//\//' -e 's/^\/el\//\//' >> ".$FILE.tmp"
    else
        # strip the scheme and host, then the /en/ and /el/ language prefixes
        echo "$line" \
            | sed -e 's/^.*\/\/[^\/]*\//\//' -e 's/^\/en\//\//' -e 's/^\/el\//\//' >> ".$FILE.tmp"
    fi
done < ".$FILE"

sort -u ".$FILE.tmp" > "$FILE"
rm -f ".$FILE" ".$FILE.tmp"

echo " URLs found : $(wc -l < "$FILE")"
echo
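# Usage sketch (the file name get-urls.sh is an assumption; invoke the
# script under whatever name it is saved as):
#
#   chmod +x get-urls.sh
#   ./get-urls.sh https://www.example.com
#
# This crawls example.com and writes the normalized, de-duplicated paths
# to example.com.urls, one per line (e.g. /about, /files/reports).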