Loading dblp/download_prepare.sh +8 −3 Original line number Diff line number Diff line Loading @@ -18,9 +18,14 @@ gzip -d dblp-2017-11-01.xml.gz # Convert XML to bracket notation. ./dblp_to_bracket.py # Remove 'www' entries. awk '!/{www{key{homepages/' dblp.bracket > dblp_no_www.bracket # Sort the dataset. ./../utilities/sort_dataset.sh dblp.bracket ./../utilities/sort_dataset.sh dblp_no_www.bracket > dblp_no_www_sorted.bracket # Tidy up. # rm *xml* # rm *.dtd rm *xml* rm *.dtd rm dblp.bracket rm dblp_no_www.bracket dblp/tidy-up.sh deleted 100644 → 0 +0 −5 Original line number Diff line number Diff line #!/bin/bash # Delete all downloaded files. rm *xml* rm *.dtd utilities/sort_dataset.sh +1 −1 Original line number Diff line number Diff line Loading @@ -32,4 +32,4 @@ # NOTE: We subtract the escaped brackets because they're part of node labels. # cat $input | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | \ sort -n --buffer-size=4G | cut -d' ' -f2- sort -n --buffer-size=10G | cut -d' ' -f2- Loading
dblp/download_prepare.sh +8 −3 Original line number Diff line number Diff line Loading @@ -18,9 +18,14 @@ gzip -d dblp-2017-11-01.xml.gz # Convert XML to bracket notation. ./dblp_to_bracket.py # Remove 'www' entries. awk '!/{www{key{homepages/' dblp.bracket > dblp_no_www.bracket # Sort the dataset. ./../utilities/sort_dataset.sh dblp.bracket ./../utilities/sort_dataset.sh dblp_no_www.bracket > dblp_no_www_sorted.bracket # Tidy up. # rm *xml* # rm *.dtd rm *xml* rm *.dtd rm dblp.bracket rm dblp_no_www.bracket
dblp/tidy-up.sh deleted 100644 → 0 +0 −5 Original line number Diff line number Diff line #!/bin/bash # Delete all downloaded files. rm *xml* rm *.dtd
utilities/sort_dataset.sh +1 −1 Original line number Diff line number Diff line Loading @@ -32,4 +32,4 @@ # NOTE: We subtract the escaped brackets because they're part of node labels. # cat $input | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | \ sort -n --buffer-size=4G | cut -d' ' -f2- sort -n --buffer-size=10G | cut -d' ' -f2-