#!/bin/bash
#
# getlinks.sh - get a list of all URLs of a domain.
#
# Crawls the given site with `wget --mirror` (downloaded pages are deleted
# right after retrieval), collects every URL wget requested and writes the
# sorted, de-duplicated list of URL paths to <host>.urls in the current
# directory.
#
#   - Ignores URLs containing /sites/.
#   - Unifies URLs of media/document files, node/<id> and content/<id> by
#     collapsing them to their parent directory.
#   - Drops a leading /en/ or /el/ language prefix.
#
# Usage:
#   ./getlinks.sh https://example.com/   # generates file example.com.urls
#
# Exit status: 0 on success, 2 on bad usage, 1 when wget is missing, the
# result file cannot be read back, or the crawl produced no URL at all.

# No `set -e` / `pipefail` on purpose: `wget --mirror` exits non-zero as soon
# as a single page answers with an error (e.g. one broken link), which must
# not abort the whole run. Failures that matter are checked explicitly.
set -u

#######################################
# Print a short usage message.
# Outputs: usage text on stderr
#######################################
usage() {
  printf 'Usage: %s <url>\n' "${0##*/}" >&2
  printf '  e.g. %s https://example.com/   # generates example.com.urls\n' \
    "${0##*/}" >&2
}

#######################################
# Derive the result file name from the start URL.
# Only the host part is used, so a start URL that carries a path, query or
# fragment never yields a file name containing "/". The scheme is optional
# and a leading "www." is dropped.
# Arguments: $1 - start URL (e.g. https://www.example.com/news)
# Outputs:   "<host>.urls" on stdout (".urls" when no host can be found)
#######################################
output_name() {
  local host=$1
  local -r scheme_re='^[A-Za-z][A-Za-z0-9+.-]*://'

  if [[ $host =~ $scheme_re ]]; then
    host=${host#"${BASH_REMATCH[0]}"}
  fi
  host=${host%%[/?#]*}   # keep the authority only
  host=${host#www.}
  printf '%s.urls\n' "$host"
}

#######################################
# Turn one crawled URL into the path that goes into the result list.
# Arguments: $1 - absolute URL as logged by wget
# Outputs:   normalised path on stdout; nothing for ignored URLs
# Returns:   0 when a path was printed, 1 when the URL is ignored
#######################################
normalize_url() {
  local url=$1
  local path lang
  local -r origin_re='^[A-Za-z][A-Za-z0-9+.-]*://[^/]*'
  local -r file_re='\.(pdf|doc|png|css|svg|xml|gif|jpg)$'
  local -r id_re='/(node|content)/[0-9]{1,6}$'

  # Ignore url sites/*
  if [[ $url == */sites/* ]]; then
    return 1
  fi

  # Strip only the leading scheme://host, so a "//" further down the URL
  # (double slash in the path, URL embedded in a query string) is untouched.
  path=$url
  if [[ $path =~ $origin_re ]]; then
    path=${path#"${BASH_REMATCH[0]}"}
  fi

  # Unify paths for files, node/<id>, content/<id>: keep the parent directory.
  if [[ $path =~ $file_re || $path =~ $id_re ]]; then
    path=${path%/*}
  fi

  # A bare origin or a file directly under the root maps to "/".
  if [[ -z $path ]]; then
    path=/
  fi

  # Drop the language prefixes, one after the other.
  for lang in en el; do
    if [[ $path == "/${lang}/"* ]]; then
      path=/${path#"/${lang}/"}
    fi
  done

  printf '%s\n' "$path"
}

#######################################
# Crawl the site and write the unique, sorted path list.
# Arguments: $1 - start URL
# Outputs:   progress banner on stdout, diagnostics on stderr,
#            result list in ./<host>.urls (overwritten on every run)
# Returns:   0 on success, 1 or 2 on error (see file header)
#######################################
main() {
  local domain=${1:-}
  local outfile url count crawl_status

  if [[ -z $domain ]]; then
    usage
    return 2
  fi

  outfile=$(output_name "$domain")
  if [[ $outfile == .urls ]]; then
    printf 'error: cannot find a host name in "%s"\n' "$domain" >&2
    usage
    return 2
  fi

  if ! command -v wget >/dev/null 2>&1; then
    printf 'error: wget is required but was not found in PATH\n' >&2
    return 1
  fi

  printf '\n'
  printf ' URLs from : %s\n' "$domain"
  printf ' To file : %s\n' "$outfile"

  # wget logs every request on stderr as "--<date> <time>--  <url>"; the URL
  # is the third whitespace-separated field. Everything is streamed through
  # one pipeline, so no scratch files are left behind and nothing from an
  # earlier run can leak into the result.
  wget --mirror --delete-after --no-directories -- "$domain" 2>&1 \
    | awk '/^--/ && NF >= 3 { print $3 }' \
    | while IFS= read -r url; do
        normalize_url "$url"
      done \
    | LC_ALL=C sort -u > "$outfile"
  crawl_status=${PIPESTATUS[0]}

  if ! count=$(wc -l < "$outfile"); then
    printf 'error: cannot read result file "%s"\n' "$outfile" >&2
    return 1
  fi
  count=$((count))   # some wc implementations pad the number with blanks

  printf ' URLs found : %s\n' "$count"
  printf '\n'

  if (( count == 0 && crawl_status != 0 )); then
    printf 'error: no URLs collected (wget exit status %s)\n' \
      "$crawl_status" >&2
    return 1
  fi
  return 0
}

main "$@"