first commit
This commit is contained in:
commit
e7f6f7b021
9
README.md
Normal file
9
README.md
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
Get a list of all URLs on a domain
|
||||||
|
|
||||||
|
- Ignores URLs containing `/sites/`
|
||||||
|
- Unifies URLs for static files (pdf, doc, png, css, svg, xml, gif, jpg), `node/*` and `content/*` by reducing each one to its parent path
|
||||||
|
|
||||||
|
```sh
|
||||||
|
./getlinks.sh https://example.com/ # generates the file example.com.urls
|
||||||
|
```
|
||||||
|
|
||||||
38
getlinks.sh
Executable file
38
getlinks.sh
Executable file
@ -0,0 +1,38 @@
|
|||||||
|
#!/bin/bash
#
# getlinks.sh - crawl a site with wget and write a sorted, de-duplicated list
# of the URL paths it visited to "<host>.urls" in the current directory.
#
# Usage: ./getlinks.sh https://example.com/
#
# Rules applied to every URL that wget reports:
#   - URLs containing /sites/ are dropped.
#   - URLs ending in pdf, doc, png, css, svg, xml, gif or jpg, and URLs ending
#     in /node/<1-6 digits> or /content/<1-6 digits>, are replaced by their
#     parent path.
#   - The scheme and host are stripped, and so is a leading /en/ or /el/.
#
# The whole job is one streaming pipeline, so no intermediate files are
# created and a re-run can never pick up results from an earlier run.

# No `set -e` / `pipefail`: wget commonly exits non-zero during a mirror (for
# example when some link returns an error) and the partial result is still
# wanted.
set -u

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the usage line on stderr and abort.
usage() {
  printf 'Usage: %s <url>\n' "${0##*/}" >&2
  exit 2
}

#######################################
# Derive the output file name from the start URL.
# Arguments: $1 - start URL, with or without a scheme
# Outputs:   "<host>.urls" on stdout (a leading "www." is removed)
#######################################
output_name() {
  local host=$1
  host=${host#*://}  # drop the scheme, if any
  host=${host%%/*}   # keep only host[:port]; a path must not leak into the name
  host=${host#www.}
  printf '%s.urls\n' "$host"
}

#######################################
# Tell whether a URL is collapsed to its parent path.
# Arguments: $1 - URL
# Returns:   0 if the URL ends in one of the listed extensions or in
#            /node/<digits> or /content/<digits>, non-zero otherwise
#######################################
is_unified() {
  case $1 in
    *pdf | *doc | *png | *css | *svg | *xml | *gif | *jpg) return 0 ;;
  esac
  [[ $1 =~ /node/[0-9]{1,6}$ || $1 =~ /content/[0-9]{1,6}$ ]]
}

#######################################
# Filter and normalise a stream of absolute URLs.
# Inputs:  one URL per line on stdin
# Outputs: one site-relative path per line on stdout (unsorted, may repeat)
#######################################
filter_urls() {
  local url parent
  # read -r keeps each URL verbatim: no word splitting, no glob expansion of
  # characters such as ? * [ that are legal in URLs.
  while IFS= read -r url || [[ -n $url ]]; do
    [[ -n $url ]] || continue
    if [[ $url == */sites/* ]]; then
      continue
    elif is_unified "$url"; then
      parent=$(dirname -- "$url")
      # For a file directly under the site root, dirname yields the bare
      # origin ("https://host"). Add the slash back so the origin-stripping
      # sed expression below reduces it to "/".
      [[ $parent == *://*/* ]] || parent+=/
      printf '%s\n' "$parent"
    else
      printf '%s\n' "$url"
    fi
  done \
    | sed -e 's/^.*\/\/[^\/]*\//\//' -e 's/^\/en\//\//' -e 's/^\/el\//\//'
}

main() {
  if (($# < 1)) || [[ -z $1 ]]; then
    usage
  fi

  local domain=$1 outfile
  outfile=$(output_name "$domain")
  [[ $outfile != .urls ]] || die "cannot derive a host name from '$domain'"
  command -v wget > /dev/null || die "wget is required but was not found in PATH"

  echo
  echo " URLs from : $domain"
  echo " To file : $outfile"

  # wget logs every request as "--<date> <time>--  <url>", so the URL is the
  # third whitespace-separated field of each line starting with "--".
  wget --mirror --delete-after --no-directories -- "$domain" 2>&1 \
    | awk '/^--/ { print $3 }' \
    | filter_urls \
    | sort \
    | uniq > "$outfile"

  echo " URLs found : $(wc -l < "$outfile")"
  echo
}

main "$@"
|
||||||
Loading…
x
Reference in New Issue
Block a user