# dblp_to_bracket.py
#
# Converts every child of the DBLP XML root element into one bracket-notation
# tree per line.
from lxml import etree
import lxml.sax
from xml.sax.handler import ContentHandler

# This class implements the SAX-like events for converting XML elements into
# bracket-notation nodes and labels.
class DBLPContentHandler(ContentHandler):
    """Accumulate the bracket notation of an XML subtree from SAX events.

    The result is collected in ``self.bn``:

    * an opening tag appends ``{`` followed by the tag's local name,
    * each attribute appends ``{name{value}}``, ordered by attribute
      local name,
    * each chunk of character data appends ``{data}``,
    * a closing tag appends ``}``.

    NOTE(review): labels are written verbatim. A literal ``{`` or ``}``
    inside a tag name, attribute or text is not escaped and would change
    the nesting of the emitted tree -- confirm whether the consumer of
    the output needs such characters escaped.
    """

    def __init__(self):
        # Initialise the base handler as well so that its own state is set
        # up before any event is delivered.
        super().__init__()
        # Bracket-notation string built so far.
        self.bn = ""

    # Open tag.
    def startElementNS(self, name, qname, attributes):
        """Append ``{localname`` and one ``{key{value}}`` per attribute.

        ``name`` is a ``(uri, localname)`` pair; ``attributes`` maps
        ``(uri, localname)`` keys to attribute values. ``qname`` is unused.
        """
        _uri, localname = name
        pieces = ["{", localname]
        # Sort the attributes by the local-name part of their keys so the
        # output does not depend on the order the attributes arrive in.
        ordered = sorted(dict(attributes).items(), key=lambda item: item[0][1])
        for key, value in ordered:
            pieces.extend(("{", key[1], "{", value, "}}"))
        self.bn += "".join(pieces)

    # Close tag.
    def endElementNS(self, name, qname):
        """Close the node opened by the matching ``startElementNS``."""
        self.bn += "}"

    # Tag content.
    def characters(self, data):
        """Append the character data as a leaf node ``{data}``."""
        self.bn += "{" + data + "}"

print("--- Loading DBLP dataset.")

# The parser is asked to load the DTD referenced by the document and to drop
# whitespace-only text between tags, so such whitespace does not produce
# character events during the conversion below.
dblp_parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
dblp_data_tree = etree.parse('dblp-2017-11-01.xml', dblp_parser)
root = dblp_data_tree.getroot()

# Output file: one bracket-notation tree per line. The context manager closes
# the file even if the conversion fails part-way, and the explicit UTF-8
# encoding keeps non-ASCII text from failing on platforms whose default
# encoding cannot represent it.
with open('dblp.bracket', 'w', encoding='utf-8') as dblp_bracket:

    print("--- Processing each child of DBLP's root.")

    for tree_id, child in enumerate(root, start=1):
        # Printing simple progress.
        if tree_id % 100000 == 0:
            print("- Tree %s" % (tree_id))
        # A fresh handler per child, so each output line holds exactly one
        # subtree.
        handler = DBLPContentHandler()
        lxml.sax.saxify(child, handler)
        dblp_bracket.write(handler.bn + "\n")

    print("--- Closing output files.")