Commit deabcccb authored by Mateusz Pawlik
Browse files

Finalized dblp and tested the entire pipeline.

parent 635388e3
Loading
Loading
Loading
Loading
+8 −3
Original line number Diff line number Diff line
@@ -18,9 +18,14 @@ gzip -d dblp-2017-11-01.xml.gz
# Convert XML to bracket notation.
./dblp_to_bracket.py

# Remove 'www' entries.
# The awk filter keeps only the lines of dblp.bracket that do NOT match
# the pattern {www{key{homepages and writes them to dblp_no_www.bracket.
awk '!/{www{key{homepages/' dblp.bracket > dblp_no_www.bracket

# Sort the dataset.
# NOTE(review): this is a flattened diff view; the first call below appears to
# be the pre-change line and the second (which sorts the filtered file and
# redirects stdout to dblp_no_www_sorted.bracket) its replacement — confirm
# against the repository.
./../utilities/sort_dataset.sh dblp.bracket
./../utilities/sort_dataset.sh dblp_no_www.bracket > dblp_no_www_sorted.bracket

# Tidy up.
# Removes files in the current directory whose names contain 'xml', all .dtd
# files, and the two intermediate bracket files; dblp_no_www_sorted.bracket is
# not removed.
# NOTE(review): the two commented-out rm lines below look like the pre-change
# side of the diff, superseded by the active rm lines — confirm.
# rm *xml*
# rm *.dtd
rm *xml*
rm *.dtd
rm dblp.bracket
rm dblp_no_www.bracket

dblp/tidy-up.sh

deleted100644 → 0
+0 −5
Original line number Diff line number Diff line
#!/bin/bash

# Delete all downloaded files.
# Both globs are relative to the current working directory: the first matches
# any name containing 'xml', the second any name ending in '.dtd'.
rm *xml*
rm *.dtd
+1 −1
Original line number Diff line number Diff line
@@ -32,4 +32,4 @@
# NOTE: We subtract the escaped brackets because they're part of node labels.
# 
# Pipeline steps:
#   1. awk prefixes every line with (count of "{") minus (count of "\{");
#      each gsub call replaces a match with equivalent text and is used only
#      for its return value, the number of matches.
#   2. sort -n orders the lines numerically by that prefix.
#   3. cut -d' ' -f2- drops the prefix again, leaving the original line.
# NOTE(review): this is a flattened diff view; the two sort lines below differ
# only in --buffer-size (4G vs 10G) and appear to be the old and new version
# of the same line — confirm against the repository.
cat $input | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | \
sort -n --buffer-size=4G | cut -d' ' -f2-
sort -n --buffer-size=10G | cut -d' ' -f2-