Commit f9b4a2bc authored by Thomas Huetter

delete xmark.bracket

parent 80deebe0
from lxml import etree
import lxml.sax
from xml.sax.handler import ContentHandler
# This class implements the SAX-like events for converting XML elements into
# bracket-notation nodes and labels.
class XMarkContentHandler(ContentHandler):
    """SAX-style handler that serializes XML events into bracket notation.

    Each element opens a ``{localname`` node that is closed by ``}`` when the
    element ends. Every attribute is emitted as ``{name{value}}`` and every
    chunk of character data as ``{text}``. The result accumulates in the
    ``bn`` attribute. Braces and backslashes inside attribute names,
    attribute values, and text are backslash-escaped so they cannot be
    mistaken for the structural braces.
    """

    # The escape table is the same for every label, so build it only once.
    _ESCAPES = str.maketrans({"{": r"\{", "}": r"\}", "\\": r"\\"})

    def __init__(self) -> None:
        # Let the base ContentHandler set up its own state first.
        super().__init__()
        # Bracket-notation string built up from the received events.
        self.bn = ""

    @classmethod
    def _escape(cls, label: str) -> str:
        """Return *label* with ``{``, ``}`` and ``\\`` backslash-escaped."""
        return label.translate(cls._ESCAPES)

    # Open tag.
    def startElementNS(self, name, qname, attributes) -> None:
        """Append the opening of an element node and its attribute nodes.

        *name* is a ``(uri, localname)`` pair; only the local name is used.
        *attributes* is a mapping whose keys are indexable pairs with the
        attribute's local name at index 1.
        """
        _uri, localname = name
        # The element's local name is written unescaped, as before.
        parts = ["{", localname]
        # Sort the attributes by their local names for a stable output order.
        by_local_name = sorted(dict(attributes).items(),
                               key=lambda item: item[0][1])
        for key, value in by_local_name:
            parts += ["{", self._escape(key[1]),
                      "{", self._escape(value), "}}"]
        # A single append per event instead of several incremental ones.
        self.bn += "".join(parts)

    # Close tag.
    def endElementNS(self, name, qname) -> None:
        """Close the node opened by the matching ``startElementNS`` call."""
        self.bn += "}"

    # Tag content.
    def characters(self, data: str) -> None:
        """Append *data* as its own escaped ``{text}`` node."""
        self.bn += "{" + self._escape(data) + "}"
# Script body: parse the XMark document stored in the file 'standard' and
# write one bracket-notation tree per child of its root to 'xmark.bracket'.
print("--- Loading XMARK dataset.")
xmark_parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
xmark_data_tree = etree.parse('standard', xmark_parser)
root = xmark_data_tree.getroot()

# Kept at module level so the name is bound even when the root has no children.
tree_id = 0

# Output files.
# The context manager closes the file even if serialization raises, and the
# explicit encoding keeps non-ASCII text independent of the system locale.
with open('xmark.bracket', 'w', encoding='utf-8') as xmark_bracket:
    print("--- Processing each child of xmark's root.")
    for tree_id, child in enumerate(root, start=1):
        # Printing simple progress.
        if tree_id % 10000 == 0:
            print("- Tree %s" % (tree_id))
        # A fresh handler per child, so each output line holds exactly one tree.
        handler = XMarkContentHandler()
        lxml.sax.saxify(child, handler)
        xmark_bracket.write(handler.bn + "\n")
    print("--- Closing output files.")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment