"""Convert the XMark XML dataset into bracket notation, one tree per line.

Reads the XML document named 'standard' from the current directory and
writes one bracket-notation line per child of the document root to
'xmark.bracket'.
"""

from xml.sax.handler import ContentHandler

# Translation table that backslash-escapes the characters that are structural
# in bracket notation (both braces) and the escape character itself. Built
# once here instead of on every attribute and text node.
_BRACKET_ESCAPES = str.maketrans({"{": r"\{", "}": r"\}", "\\": r"\\"})


def _escape(text):
    """Return *text* with braces and backslashes backslash-escaped."""
    return text.translate(_BRACKET_ESCAPES)


# This class implements the SAX-like events for converting XML elements into
# bracket-notation nodes and labels.
class XMarkContentHandler(ContentHandler):
    """Accumulate the bracket notation of the received SAX events.

    The resulting string is available in the ``bn`` attribute.
    """

    def __init__(self):
        super().__init__()
        # Bracket-notation string built so far.
        self.bn = ""

    # Open tag.
    def startElementNS(self, name, qname, attributes):
        """Open a node labelled with the element's local name.

        Every attribute is appended as a child node ``{key{value}}``, ordered
        by the attribute's local name.

        Args:
            name: ``(uri, localname)`` pair of the element.
            qname: Qualified element name (unused).
            attributes: Mapping from ``(uri, localname)`` pairs to values.
        """
        _uri, localname = name
        pieces = ["{" + localname]
        # Sort the attributes by their keys (local names) so that the output
        # does not depend on the attribute order of the input.
        by_local_name = sorted(
            dict(attributes).items(), key=lambda item: item[0][1]
        )
        for key, value in by_local_name:
            pieces.append("{" + _escape(key[1]) + "{" + _escape(value) + "}}")
        self.bn += "".join(pieces)

    # Close tag.
    def endElementNS(self, name, qname):
        """Close the node opened by the matching ``startElementNS`` call."""
        self.bn += "}"

    # Tag content.
    def characters(self, data):
        """Append *data* as an escaped leaf node ``{data}``."""
        self.bn += "{" + _escape(data) + "}"


def main():
    """Parse the XMark dataset and write each root child in bracket notation."""
    # lxml is imported here, where it is used, so that the handler class can
    # be imported without lxml being installed.
    from lxml import etree
    import lxml.sax

    print("--- Loading XMARK dataset.")
    xmark_parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
    xmark_data_tree = etree.parse('standard', xmark_parser)
    root = xmark_data_tree.getroot()

    # Output files. The context manager closes the file even if a tree fails
    # to convert part-way through.
    with open('xmark.bracket', 'w') as xmark_bracket:
        print("--- Processing each child of xmark's root.")
        for tree_id, child in enumerate(root, start=1):
            # Printing simple progress.
            if tree_id % 10000 == 0:
                print("- Tree %s" % (tree_id))
            handler = XMarkContentHandler()
            lxml.sax.saxify(child, handler)
            xmark_bracket.write(handler.bn + "\n")
        print("--- Closing output files.")


if __name__ == "__main__":
    main()