first commit
This commit is contained in:
commit
e7f6f7b021
9
README.md
Normal file
9
README.md
Normal file
@ -0,0 +1,9 @@
|
||||
Get a list of all URLs of a domain
|
||||
|
||||
- Ignores URLs under `sites/*`
|
||||
- Unifies URLs for media files, `node/*` and `content/*` (each is collapsed to its parent directory)
|
||||
|
||||
```sh
|
||||
./getlinks.sh https://example.com/ # generate file example.com.urls
|
||||
```
|
||||
|
||||
38
getlinks.sh
Executable file
38
getlinks.sh
Executable file
@ -0,0 +1,38 @@
|
||||
#!/bin/bash
#
# getlinks.sh - crawl a site with wget and write the de-duplicated list of
# its URL paths to <host>.urls in the current directory.
#
# Usage: ./getlinks.sh https://example.com/    # generates example.com.urls
#
# `set -e` is deliberately not used: `wget --mirror` exits non-zero as soon
# as a single link on the site is broken, and the URLs it did visit are
# still wanted in that case.
set -u

#######################################
# Derive the output file name from the start URL.
# Arguments: $1 - start URL; the scheme and a leading "www." are optional
# Outputs:   "<host>.urls" on stdout
# Returns:   0 on success, 1 when no host can be extracted
#######################################
urls_file_name() {
  local host=${1-}

  host=${host#*://}      # drop the scheme, if any
  host=${host%%[/?#]*}   # keep the host only: a path would put "/" in the name
  host=${host#www.}      # strip "www." only at the very start of the host
  [[ -n "$host" ]] || return 1
  printf '%s.urls\n' "$host"
}

#######################################
# Turn one crawled URL into the site-relative path that gets recorded.
#   - URLs containing /sites/ are ignored.
#   - Media files (suffix pdf, doc, png, css, svg, xml, gif, jpg) and
#     /node/<1-6 digits>, /content/<1-6 digits> pages are collapsed to
#     their parent directory.
#   - A leading /en/ and then a leading /el/ path segment is removed.
# Arguments: $1 - absolute URL as logged by wget
# Outputs:   the normalized path on stdout (nothing for ignored URLs)
# Returns:   0 when a path was printed, 1 when the URL is ignored
#######################################
normalize_url() {
  local url=${1-}
  local path
  local -r unify_re='((pdf|doc|png|css|svg|xml|gif|jpg)|/(node|content)/[0-9]{1,6})$'

  [[ -n "$url" && "$url" != */sites/* ]] || return 1

  # Strip "scheme://host". Anchoring on the first "://" and the first "/"
  # after it keeps URLs whose query string contains "//" intact.
  path=${url#*://}
  if [[ "$path" == */* ]]; then
    path=/${path#*/}
  else
    path=/
  fi

  if [[ "$url" =~ $unify_re ]]; then
    # Parent directory, with trailing slashes collapsed like dirname(1).
    path=${path%/*}
    while [[ "$path" == */ ]]; do
      path=${path%/}
    done
    # A file directly under the site root belongs to "/".
    [[ -n "$path" ]] || path=/
  fi

  # Language prefixes are removed one after the other, /en/ first.
  [[ "$path" == /en/* ]] && path=/${path#/en/}
  [[ "$path" == /el/* ]] && path=/${path#/el/}

  printf '%s\n' "$path"
}

#######################################
# Entry point: crawl the site and write the sorted, unique path list.
# Arguments: $1 - start URL
# Outputs:   progress summary on stdout, diagnostics on stderr;
#            writes <host>.urls in the current directory
# Returns:   0 on success, 2 on usage errors, 127 when wget is missing
#######################################
main() {
  local domain=${1-}
  local outfile url

  if [[ -z "$domain" ]]; then
    printf 'Usage: %s URL\n' "${0##*/}" >&2
    return 2
  fi
  if ! outfile=$(urls_file_name "$domain"); then
    printf '%s: cannot derive a host name from "%s"\n' "${0##*/}" "$domain" >&2
    return 2
  fi
  if ! command -v wget >/dev/null; then
    printf '%s: wget is required but was not found in PATH\n' "${0##*/}" >&2
    return 127
  fi

  printf '\n'
  printf '%s\n' " URLs from : $domain"
  printf '%s\n' " To file : $outfile"

  # Everything is streamed through one pipeline, so no hidden temp files
  # are left behind and results of earlier runs can never leak into this
  # one. wget logs each request as "--<date> <time>--  <url>"; the URL is
  # the third field. Reading line by line (instead of word-splitting a
  # `cat`) keeps URLs containing "?" or "*" from being glob-expanded.
  wget --mirror --delete-after --no-directories -- "$domain" 2>&1 \
    | awk '/^--/ && NF >= 3 { print $3 }' \
    | while IFS= read -r url; do
        normalize_url "$url"
      done \
    | LC_ALL=C sort -u > "$outfile"   # byte-wise: same order on every machine

  printf '%s\n' " URLs found : $(wc -l < "$outfile")"
  printf '\n'
}

main "$@"
|
||||
Loading…
x
Reference in New Issue
Block a user