Commit f9b4a2bc authored by Thomas Huetter's avatar Thomas Huetter
Browse files

delete xmark.bracket

parent 80deebe0
Loading
Loading
Loading
Loading

xmark/xmark.bracket

deleted100644 → 0
+0 −0

Empty file deleted.

+57 −0
Original line number Diff line number Diff line
from lxml import etree
import lxml.sax
from xml.sax.handler import ContentHandler

# This class implements the SAX-like events for converting XML elements into
# bracket-notation nodes and labels.
class XMarkContentHandler(ContentHandler):
    """SAX-style handler that serializes XML events into bracket notation.

    Every element becomes ``{name ...}``. Each attribute becomes a child node
    ``{attr_name{attr_value}}`` and every chunk of character data becomes a
    child node ``{text}``. The characters ``{``, ``}`` and ``\\`` are
    backslash-escaped in attribute names, attribute values and text so they
    cannot be confused with the structural brackets.

    The result accumulates in the ``bn`` attribute (a ``str``).
    """

    # Translation table escaping the characters that are structural in
    # bracket notation. Built once instead of on every event.
    _ESCAPES = str.maketrans({"{": r"\{", "}": r"\}", "\\": r"\\"})

    def __init__(self):
        # Let the base handler set up its own state before adding ours.
        super().__init__()
        self.bn = ""

    @classmethod
    def _escape(cls, text):
        """Return *text* with ``{``, ``}`` and ``\\`` backslash-escaped."""
        return text.translate(cls._ESCAPES)

    # Open tag.
    def startElementNS(self, name, qname, attributes):
        """Open a node for the element and emit its attributes as children.

        Args:
            name: ``(uri, localname)`` pair; only the local name is used.
            qname: Qualified name (unused).
            attributes: Mapping from ``(uri, localname)`` to attribute value.
        """
        _uri, localname = name
        pieces = ["{", localname]
        # Sort the attributes by their local name so that the output does not
        # depend on the attribute order in the document.
        by_local_name = sorted(dict(attributes).items(),
                               key=lambda item: item[0][1])
        for key, value in by_local_name:
            pieces.append(
                "{" + self._escape(key[1]) + "{" + self._escape(value) + "}}"
            )
        self.bn += "".join(pieces)

    # Close tag.
    def endElementNS(self, name, qname):
        """Close the node opened by the matching ``startElementNS`` call."""
        self.bn += "}"

    # Tag content.
    def characters(self, data):
        """Emit the character data *data* as an escaped leaf node."""
        self.bn += "{" + self._escape(data) + "}"

# Script body: convert every child of the XMark document root into one
# bracket-notation tree and write each tree on its own line of 'xmark.bracket'.
print("--- Loading XMARK dataset.")

# The input document is read from the file named 'standard', resolved
# relative to the current working directory.
xmark_parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
xmark_data_tree = etree.parse('standard', xmark_parser)
root = xmark_data_tree.getroot()

# Output file. The context manager guarantees the file is flushed and closed
# even if the conversion of some subtree raises. The encoding is fixed to
# UTF-8 so that non-ASCII text from the XML does not depend on the platform's
# default locale encoding.
with open('xmark.bracket', 'w', encoding='utf-8') as xmark_bracket:
    print("--- Processing each child of xmark's root.")

    for tree_id, child in enumerate(root, start=1):
        # Printing simple progress.
        if tree_id % 10000 == 0:
            print("- Tree %s" % (tree_id))
        # A fresh handler per child keeps each tree's output separate.
        handler = XMarkContentHandler()
        lxml.sax.saxify(child, handler)
        xmark_bracket.write(handler.bn + "\n")

    print("--- Closing output files.")