Commit 0c22efd2 authored by Mateusz Pawlik
Browse files

Deleted xmark dataset.

We don't use it. It is archived on Nextcloud in 'ted-join-leftovers'.
parent 2886ebdf
Loading
Loading
Loading
Loading

xmark/README.md

deleted100644 → 0
+0 −3
Original line number Diff line number Diff line
# X-Mark

NOT FINISHED.
 No newline at end of file

xmark/download.sh

deleted100755 → 0
+0 −10
Original line number Diff line number Diff line
#!/bin/bash
#
# Fetches the XMark "standard" XML document and its DTD into the current
# directory, then decompresses the XML document in place (standard.gz becomes
# standard).

# Stop at the first failing command so that a failed download is never
# followed by an attempt to extract a missing or partial archive.
set -euo pipefail

# Download the XML file.
wget -v http://www.ins.cwi.nl/projects/xmark/Assets/standard.gz

# Download the DTD file.
wget -v http://www.ins.cwi.nl/projects/xmark/Assets/auction.dtd

# Extract the XML file.
gzip -d standard.gz
 No newline at end of file

xmark/sort_dataset.sh

deleted100755 → 0
+0 −4
Original line number Diff line number Diff line
#!/bin/bash
#
# Sorts the lines of a bracket-notation file by their number of nodes
# (equivalent to the number of "{" on the line), smallest first.
#
# Usage:  sort_dataset.sh <file>.bracket
# Output: <file>_sorted.bracket, written next to the input file.

# Without an input path the pipeline would read stdin and write to a file
# literally named "_sorted.bracket", so refuse to run instead.
if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <file>.bracket" >&2
    exit 1
fi

# awk prefixes every line with its "{" count (gsub returns the number of
# replacements it made), sort orders numerically on that prefix, and cut
# removes the prefix again. The path is quoted so that names containing
# spaces or glob characters are handled correctly.
awk '{print gsub("{","{"), $0}' "$1" | sort -n | cut -d' ' -f2- > "${1%.bracket}_sorted.bracket"

xmark/xmark_to_bracket.py

deleted100644 → 0
+0 −57
Original line number Diff line number Diff line
from lxml import etree
import lxml.sax
from xml.sax.handler import ContentHandler

# This class implements the SAX-like events for converting XML elements into
# bracket-notation nodes and labels.
class XMarkContentHandler(ContentHandler):
    """Accumulate SAX events into a bracket-notation string.

    After feeding events to the handler, the result is available in ``bn``.
    Every element becomes ``{localname ...}``, every attribute becomes a
    ``{name{value}}`` child emitted in order of attribute local name, and
    every chunk of character data becomes a ``{text}`` child.

    Attribute names, attribute values and character data are escaped;
    element names are written unchanged.
    """

    # Characters with structural meaning in bracket notation, plus the escape
    # character itself. Built once instead of on every SAX event.
    _ESCAPE_TABLE = str.maketrans({"{": r"\{", "}": r"\}", "\\": r"\\"})

    def __init__(self):
        # Initialise the base handler so that its own state is set up too.
        super().__init__()
        self.bn = ""

    @classmethod
    def _escape(cls, text):
        """Return ``text`` with braces and backslashes backslash-escaped."""
        return text.translate(cls._ESCAPE_TABLE)

    # Open tag.
    def startElementNS(self, name, qname, attributes):
        """Open a node for the element and append its attributes.

        ``name`` is a ``(uri, localname)`` pair; only the local name is used.
        ``attributes`` maps ``(uri, localname)`` keys to string values.
        """
        _uri, localname = name
        pieces = ["{" + localname]
        # Sort the attributes by their local names so the output does not
        # depend on the order in which the parser reports them.
        for key, value in sorted(dict(attributes).items(),
                                 key=lambda item: item[0][1]):
            pieces.append(
                "{" + self._escape(key[1]) + "{" + self._escape(value) + "}}"
            )
        self.bn += "".join(pieces)

    # Close tag.
    def endElementNS(self, name, qname):
        """Close the node opened by the matching ``startElementNS`` call."""
        self.bn += "}"

    # Tag content.
    def characters(self, data):
        """Append ``data`` as an escaped leaf node."""
        self.bn += "{" + self._escape(data) + "}"

# Converts every child of the XMark document's root element into one line of
# bracket notation and writes the lines to 'xmark.bracket'.

print("--- Loading XMARK dataset.")

# The parser is asked to load the DTD and to drop blank text; presumably the
# latter keeps whitespace-only text between tags from becoming nodes in the
# output (see the lxml XMLParser documentation to confirm).
xmark_parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
xmark_data_tree = etree.parse('standard', xmark_parser)
root = xmark_data_tree.getroot()

# Output files.
# The context manager closes the file even if converting a subtree raises,
# and the explicit encoding keeps the output independent of the locale.
with open('xmark.bracket', 'w', encoding='utf-8') as xmark_bracket:
    print("--- Processing each child of xmark's root.")

    for tree_id, child in enumerate(root, start=1):
        # Printing simple progress.
        if tree_id % 1000 == 0:
            print("- Tree %s" % (tree_id))
        # A fresh handler per child, so each output line holds one subtree.
        handler = XMarkContentHandler()
        lxml.sax.saxify(child, handler)
        xmark_bracket.write(handler.bn + "\n")

    print("--- Closing output files.")