first commit

This commit is contained in:
Ste Vaidis 2023-12-22 13:22:56 +02:00
commit e7f6f7b021
2 changed files with 47 additions and 0 deletions

9
README.md Normal file
View File

@ -0,0 +1,9 @@
Get a list of all URLs found on a domain
- Ignores urls with `sites/*`
- Unifies URLs for media files, `node/*`, and `content/*` (each is reduced to its parent path)
```sh
./getlinks.sh https://example.com/ # generate file example.com.urls
```

38
getlinks.sh Executable file
View File

@ -0,0 +1,38 @@
#!/bin/bash
#
# getlinks.sh - crawl a site with wget and write the sorted, de-duplicated
# list of its URL paths to "<host>.urls" in the current directory.
#
# Usage: ./getlinks.sh https://example.com/
#
# Rules applied to every URL that wget visits:
#   - URLs containing "/sites/" are ignored.
#   - Media files (.pdf .doc .png .css .svg .xml .gif .jpg) and
#     /node/<id>, /content/<id> URLs (id = 1-6 digits) are reduced to their
#     parent path, so many such URLs collapse into a single entry.
#   - The scheme and host are removed, then a leading /en/ or /el/ language
#     prefix is dropped.
#
# Deliberately no `set -e`: `wget --mirror` exits non-zero as soon as a
# single link returns a server error, yet the URL list it logged is still
# usable. Critical steps are checked explicitly instead.

# Scratch directory; global so the EXIT trap can still see it after main ends.
work_dir=

# Print a diagnostic message to stderr.
err() {
  printf '%s\n' "$*" >&2
}

# Remove the scratch directory, if one was created.
cleanup() {
  if [[ -n "$work_dir" ]]; then
    rm -rf -- "$work_dir"
  fi
}

#######################################
# Derive the output file name from the start URL.
# Arguments: $1 - start URL, e.g. https://www.example.com/
# Outputs:   "<host>.urls" on stdout, host without scheme, path or leading
#            "www." (just ".urls" when no host can be extracted)
#######################################
output_name() {
  local host=$1
  host=${host#*://}      # drop the scheme, when there is one
  host=${host%%[/?#]*}   # drop path, query string and fragment
  host=${host#www.}      # drop "www." only as a host prefix
  printf '%s.urls\n' "$host"
}

#######################################
# Convert one crawled URL into the path that should be recorded.
# Arguments: $1 - absolute URL as logged by wget
# Outputs:   the normalized path on stdout; nothing for ignored URLs
#######################################
normalize_url() {
  local url=$1
  local path
  local id_re='/(node|content)/[0-9]{1,6}$'

  # Ignore everything under sites/*.
  if [[ "$url" == */sites/* ]]; then
    return 0
  fi

  # Unify media files and numeric node/content pages to their parent path.
  case "$url" in
    *.pdf | *.doc | *.png | *.css | *.svg | *.xml | *.gif | *.jpg)
      url=${url%/*}
      ;;
    *)
      if [[ "$url" =~ $id_re ]]; then
        url=${url%/*}
      fi
      ;;
  esac

  # Strip "scheme://host". Only the FIRST "://" counts, so a URL embedded in
  # a query string (…?u=http://other/x) is left intact.
  if [[ "$url" == *://* ]]; then
    path=${url#*://}
    if [[ "$path" == */* ]]; then
      path=/${path#*/}
    else
      path=/   # bare host, e.g. the parent of a root-level media file
    fi
  else
    path=$url
  fi

  # Drop a leading language prefix: first /en/, then /el/.
  if [[ "$path" == /en/* ]]; then
    path=/${path#/en/}
  fi
  if [[ "$path" == /el/* ]]; then
    path=/${path#/el/}
  fi

  printf '%s\n' "$path"
}

#######################################
# Entry point: crawl the site and write the URL list.
# Globals:   work_dir (written)
# Arguments: $1 - start URL
# Returns:   0 on success, 2 on usage error, 1 on any other failure
#######################################
main() {
  if (($# < 1)) || [[ -z "$1" ]]; then
    err "Usage: ${0##*/} https://example.com/"
    return 2
  fi

  local domain=$1
  local out_file raw_list path_list url

  out_file=$(output_name "$domain")
  if [[ "$out_file" == ".urls" ]]; then
    err "Cannot derive a host name from: $domain"
    return 2
  fi

  if ! command -v wget >/dev/null; then
    err "wget is required but was not found in PATH"
    return 1
  fi

  # Fresh scratch space for every run: results of an earlier run can never
  # leak into this one and nothing is left behind afterwards.
  work_dir=$(mktemp -d) || {
    err "mktemp failed"
    return 1
  }
  trap cleanup EXIT
  raw_list=$work_dir/raw.list
  path_list=$work_dir/paths.list
  # Downloads get their own subdirectory so that a fetched file can never
  # overwrite the list files above.
  mkdir -- "$work_dir/downloads" || return 1

  echo
  echo " URLs from : $domain"
  echo " To file : $out_file"

  # wget logs every request as "--<date> <time>--  <url>"; field 3 is the URL.
  wget --mirror --delete-after --no-directories \
    --directory-prefix="$work_dir/downloads" -- "$domain" 2>&1 \
    | awk '/^--/ {print $3}' >"$raw_list"

  while IFS= read -r url; do
    [[ -n "$url" ]] || continue
    normalize_url "$url"
  done <"$raw_list" >"$path_list"

  if ! sort -u -- "$path_list" >"$out_file"; then
    err "Could not write $out_file"
    return 1
  fi

  echo " URLs found : $(wc -l <"$out_file")"
  echo
}

main "$@"