Commit deabcccb authored Oct 23, 2018 by Mateusz Pawlik

Finalized dblp and tested the entire pipeline.

parent 635388e3

dblp/download_prepare.sh

+8 −3

Original line number	Diff line number	Diff line
		@@ -18,9 +18,14 @@ gzip -d dblp-2017-11-01.xml.gz
		# Convert XML to bracket notation.
		./dblp_to_bracket.py

		# Remove 'www' entries.
		awk '!/{www{key{homepages/' dblp.bracket > dblp_no_www.bracket

		# Sort the dataset.
		./../utilities/sort_dataset.sh dblp.bracket
		./../utilities/sort_dataset.sh dblp_no_www.bracket > dblp_no_www_sorted.bracket

		# Tidy up.
		# rm xml
		# rm *.dtd
		rm xml
		rm *.dtd
		rm dblp.bracket
		rm dblp_no_www.bracket

dblp/tidy-up.sh

deleted 100644 → 0

+0 −5

Original line number	Diff line number	Diff line
		#!/bin/bash

		# Delete all downloaded files.
		rm xml
		rm *.dtd

utilities/sort_dataset.sh

+1 −1

Original line number	Diff line number	Diff line
		@@ -32,4 +32,4 @@
		# NOTE: We subtract the escaped brackets because they're part of node labels.
		#
		cat $input | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | \
		sort -n --buffer-size=4G | cut -d' ' -f2-
		sort -n --buffer-size=10G | cut -d' ' -f2-