Commit deabcccb authored by Mateusz Pawlik

Finalized dblp and tested the entire pipeline.

parent 635388e3
...@@ -18,9 +18,14 @@ gzip -d dblp-2017-11-01.xml.gz ...@@ -18,9 +18,14 @@ gzip -d dblp-2017-11-01.xml.gz
# Convert XML to bracket notation. # Convert XML to bracket notation.
./dblp_to_bracket.py ./dblp_to_bracket.py
# Remove 'www' entries.
awk '!/{www{key{homepages/' dblp.bracket > dblp_no_www.bracket
# Sort the dataset. # Sort the dataset.
./../utilities/sort_dataset.sh dblp.bracket ./../utilities/sort_dataset.sh dblp_no_www.bracket > dblp_no_www_sorted.bracket
# Tidy up. # Tidy up.
# rm *xml* rm *xml*
# rm *.dtd rm *.dtd
rm dblp.bracket
rm dblp_no_www.bracket
#!/bin/bash
# Delete all downloaded files.
rm *xml*
rm *.dtd
...@@ -32,4 +32,4 @@ ...@@ -32,4 +32,4 @@
# NOTE: We subtract the escaped brackets because they're part of node labels. # NOTE: We subtract the escaped brackets because they're part of node labels.
# #
cat $input | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | \ cat $input | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | \
sort -n --buffer-size=4G | cut -d' ' -f2- sort -n --buffer-size=10G | cut -d' ' -f2-
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment