Commit 0c22efd2 authored by Mateusz Pawlik

Deleted xmark dataset.

We don't use it. It is archived on Nextcloud in 'ted-join-leftovers'.
parent 2886ebdf
# X-Mark
NOT FINISHED.
\ No newline at end of file
#!/bin/bash
# Fetch the XMark "standard" document and its DTD, then unpack the document.
#
# Abort on the first failing command (and on unset variables or a failing
# pipeline stage) so that a failed download is never followed by an attempt
# to decompress a missing or partial archive.
set -euo pipefail

# Download the XML file.
wget -v http://www.ins.cwi.nl/projects/xmark/Assets/standard.gz
# Download the DTD file.
wget -v http://www.ins.cwi.nl/projects/xmark/Assets/auction.dtd
# Extract the XML file; gzip -d replaces standard.gz with a file named "standard".
gzip -d standard.gz
\ No newline at end of file
#!/bin/bash
# Sort the lines of a bracket-notation file by number of nodes (equivalent to
# the number of "{" on the line), smallest first.
#
# Usage: <this script> INPUT.bracket
# The result is written to INPUT_sorted.bracket, i.e. the input path with a
# trailing ".bracket" stripped and "_sorted.bracket" appended.
if [ "$#" -ne 1 ]; then
  echo "usage: $0 FILE.bracket" >&2
  exit 1
fi

# awk prefixes every line with its "{" count: gsub returns the number of
# substitutions and replaces "{" by itself, so the line text is unchanged.
# sort orders numerically on that prefix and cut strips the prefix again.
# "$1" is quoted so paths containing spaces or glob characters work.
awk '{print gsub("{","{"), $0}' "$1" | sort -n | cut -d' ' -f2- > "${1%.bracket}_sorted.bracket"
from lxml import etree
import lxml.sax
from xml.sax.handler import ContentHandler
class XMarkContentHandler(ContentHandler):
    """SAX-like event handler that converts XML elements to bracket notation.

    Each namespace-aware SAX event appends text to ``self.bn``:

    * an opening tag appends ``{`` followed by the element's local name, then
      one ``{name{value}}`` group per attribute, ordered by attribute local
      name;
    * character data appends ``{text}``;
    * a closing tag appends ``}``.

    In labels, attribute values and text, the characters ``{``, ``}`` and
    ``\\`` are escaped with a leading backslash.

    Attributes:
        bn: The bracket-notation string accumulated so far.
    """

    # Translation table shared by all instances; built once instead of on
    # every SAX event.
    _ESCAPES = str.maketrans({"{": r"\{", "}": r"\}", "\\": r"\\"})

    def __init__(self):
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.bn = ""

    @classmethod
    def _escape(cls, text):
        """Return *text* with ``{``, ``}`` and ``\\`` backslash-escaped."""
        return text.translate(cls._ESCAPES)

    # Open tag.
    def startElementNS(self, name, qname, attributes):
        """Append the opening of an element node and its attribute subtrees.

        Args:
            name: ``(uri, localname)`` pair; only the local name is emitted.
            qname: Qualified name (unused).
            attributes: Mapping from ``(uri, localname)`` keys to values.
        """
        uri, localname = name
        pieces = ["{", localname]
        # Sort the attributes by their local names so the output does not
        # depend on the order in which the attributes were supplied.
        by_local_name = sorted(dict(attributes).items(),
                               key=lambda element: element[0][1])
        for key, value in by_local_name:
            pieces.append(
                "{" + self._escape(key[1]) + "{" + self._escape(value) + "}}"
            )
        self.bn += "".join(pieces)

    # Close tag.
    def endElementNS(self, name, qname):
        """Close the node opened by the matching ``startElementNS`` call."""
        self.bn += "}"

    # Tag content.
    def characters(self, data):
        """Append character data as a leaf node with an escaped label."""
        self.bn += "{" + self._escape(data) + "}"
# Convert every child of the XMark document's root element into one
# bracket-notation tree and write the trees, one per line, to 'xmark.bracket'.
print("--- Loading XMARK dataset.")
xmark_parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
xmark_data_tree = etree.parse('standard', xmark_parser)
root = xmark_data_tree.getroot()

# Output files.
# The context manager closes the file even if converting a subtree raises;
# the explicit encoding keeps the output independent of the platform locale.
with open('xmark.bracket', 'w', encoding='utf-8') as xmark_bracket:
    print("--- Processing each child of xmark's root.")
    for tree_id, child in enumerate(root, start=1):
        # Printing simple progress.
        if tree_id % 1000 == 0:
            print("- Tree %s" % (tree_id))
        # A fresh handler per child, so each output line holds exactly one tree.
        handler = XMarkContentHandler()
        lxml.sax.saxify(child, handler)
        xmark_bracket.write(handler.bn + "\n")
    print("--- Closing output files.")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment