Commit 80deebe0 authored by Thomas Huetter's avatar Thomas Huetter
Browse files

removed datafiles

parent fd127aa5
Loading
Loading
Loading
Loading

xmark/auction.dtd

deleted100644 → 0
+0 −102
Original line number Diff line number Diff line
<!-- DTD for auction database -->
<!-- $Id: auction.dtd,v 1.15 2001/01/29 21:42:35 albrecht Exp $ -->

<!-- Document root. The six top-level sections are a strict sequence: each
     must appear exactly once and in the order listed. -->
<!ELEMENT site            (regions, categories, catgraph, people, open_auctions, closed_auctions)>

<!-- Category catalogue: at least one category, each identified by an ID
     attribute that IDREF attributes elsewhere in this DTD can point to. -->
<!ELEMENT categories      (category+)>
<!ELEMENT category        (name, description)>
<!ATTLIST category        id ID #REQUIRED>
<!ELEMENT name            (#PCDATA)>
<!-- A description is exactly one of: a text run or a paragraph list. -->
<!ELEMENT description     (text | parlist)>
<!-- Mixed content: text, bold, keyword and emph share the same model and
     may nest inside each other to arbitrary depth. -->
<!ELEMENT text            (#PCDATA | bold | keyword | emph)*>
<!ELEMENT bold		  (#PCDATA | bold | keyword | emph)*>
<!ELEMENT keyword	  (#PCDATA | bold | keyword | emph)*>
<!ELEMENT emph		  (#PCDATA | bold | keyword | emph)*>
<!-- Lists are recursive: a listitem may itself contain further parlists.
     Both parlist and listitem are allowed to be empty. -->
<!ELEMENT parlist	  (listitem)*>
<!ELEMENT listitem        (text | parlist)*>

<!-- Graph over IDs: zero or more directed edges. Each edge is an empty
     element whose required from/to attributes are IDREFs. The DTD itself
     does not restrict which kind of element those IDs belong to. -->
<!ELEMENT catgraph        (edge*)>
<!ELEMENT edge            EMPTY>
<!ATTLIST edge            from IDREF #REQUIRED to IDREF #REQUIRED>

<!-- Items grouped by region. All six region elements are required, in the
     order given by the regions content model (the declaration order below
     differs, which is irrelevant to validation). Each region may be empty. -->
<!ELEMENT regions         (africa, asia, australia, europe, namerica, samerica)>
<!ELEMENT africa          (item*)>
<!ELEMENT asia            (item*)>
<!ELEMENT australia       (item*)>
<!ELEMENT namerica        (item*)>
<!ELEMENT samerica        (item*)>
<!ELEMENT europe          (item*)>
<!-- An item is a fixed sequence; it must belong to at least one category
     (incategory+) and always carries a mailbox, which may hold no mail. -->
<!ELEMENT item            (location, quantity, name, payment, description, shipping, incategory+, mailbox)>
<!ATTLIST item            id ID #REQUIRED
                          featured CDATA #IMPLIED>
<!ELEMENT location        (#PCDATA)>
<!ELEMENT quantity        (#PCDATA)>
<!ELEMENT payment         (#PCDATA)>
<!ELEMENT shipping        (#PCDATA)>
<!-- reserve is not part of item; it is referenced by open_auction. -->
<!ELEMENT reserve         (#PCDATA)>
<!ELEMENT incategory      EMPTY>
<!ATTLIST incategory      category IDREF #REQUIRED>
<!ELEMENT mailbox         (mail*)>
<!ELEMENT mail            (from, to, date, text)>
<!-- Note: "from" and "to" are child elements of mail here, while edge uses
     the same names for its IDREF attributes. -->
<!ELEMENT from            (#PCDATA)>
<!ELEMENT to              (#PCDATA)>
<!ELEMENT date            (#PCDATA)>
<!-- Empty reference elements; they are used by the auction and bidder
     content models rather than by item. -->
<!ELEMENT itemref         EMPTY>
<!ATTLIST itemref         item IDREF #REQUIRED>
<!ELEMENT personref       EMPTY>
<!ATTLIST personref       person IDREF #REQUIRED>

<!-- Registered persons. Only name and emailaddress are mandatory; every
     other child of person is optional but must keep the listed order. -->
<!ELEMENT people          (person*)>
<!ELEMENT person          (name, emailaddress, phone?, address?, homepage?, creditcard?, profile?, watches?)>
<!ATTLIST person          id ID #REQUIRED>
<!ELEMENT emailaddress    (#PCDATA)>
<!ELEMENT phone           (#PCDATA)>
<!-- Address order is street, city, country, then optional province, then
     zipcode. -->
<!ELEMENT address         (street, city, country, province?, zipcode)>
<!ELEMENT street          (#PCDATA)>
<!ELEMENT city            (#PCDATA)>
<!ELEMENT province        (#PCDATA)>
<!ELEMENT zipcode         (#PCDATA)>
<!ELEMENT country         (#PCDATA)>
<!ELEMENT homepage        (#PCDATA)>
<!ELEMENT creditcard      (#PCDATA)>
<!-- Within profile only business is mandatory. Income is carried as an
     optional attribute of profile, not as a child element. -->
<!ELEMENT profile         (interest*, education?, gender?, business, age?)>
<!ATTLIST profile         income CDATA #IMPLIED>
<!ELEMENT interest        EMPTY>
<!ATTLIST interest        category IDREF #REQUIRED>
<!ELEMENT education       (#PCDATA)>
<!-- The income element is declared but no content model in this file
     references it. -->
<!ELEMENT income          (#PCDATA)>
<!ELEMENT gender          (#PCDATA)>
<!ELEMENT business        (#PCDATA)>
<!ELEMENT age             (#PCDATA)>
<!-- Each watch is an empty element holding an IDREF in its open_auction
     attribute. -->
<!ELEMENT watches         (watch*)>
<!ELEMENT watch           EMPTY>
<!ATTLIST watch           open_auction IDREF #REQUIRED>

<!-- Auctions still running. The content model is a strict sequence in which
     reserve and privacy are optional and bidder may repeat zero or more
     times; all remaining children are required. -->
<!ELEMENT open_auctions   (open_auction*)>
<!ELEMENT open_auction    (initial, reserve?, bidder*, current, privacy?, itemref, seller, annotation, quantity, type, interval)>
<!ATTLIST open_auction    id ID #REQUIRED>
<!ELEMENT privacy         (#PCDATA)>
<!ELEMENT initial         (#PCDATA)>
<!-- One bid: date and time, a reference to the bidding person, and the
     increase value. -->
<!ELEMENT bidder          (date, time, personref, increase)>
<!ELEMENT seller          EMPTY>
<!ATTLIST seller          person IDREF #REQUIRED>
<!ELEMENT current         (#PCDATA)>
<!ELEMENT increase        (#PCDATA)>
<!ELEMENT type            (#PCDATA)>
<!ELEMENT interval        (start, end)>
<!ELEMENT start           (#PCDATA)>
<!ELEMENT end             (#PCDATA)>
<!ELEMENT time            (#PCDATA)>
<!-- status and amount are declared but not referenced by any content model
     in this file. -->
<!ELEMENT status          (#PCDATA)>
<!ELEMENT amount          (#PCDATA)>

<!-- Finished auctions. Unlike open_auction, no ATTLIST is declared for
     closed_auction in this file, and its annotation child is optional. -->
<!ELEMENT closed_auctions (closed_auction*)>
<!ELEMENT closed_auction  (seller, buyer, itemref, price, date, quantity, type, annotation?)>
<!ELEMENT buyer           EMPTY>
<!ATTLIST buyer           person IDREF #REQUIRED>
<!ELEMENT price           (#PCDATA)>
<!-- An annotation has a required author and happiness value; the
     description between them is optional. -->
<!ELEMENT annotation      (author, description?, happiness)>

<!ELEMENT author          EMPTY>
<!ATTLIST author          person IDREF #REQUIRED>
<!ELEMENT happiness       (#PCDATA)>

xmark/standard

deleted100644 → 0
+0 −0

File deleted.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.

xmark/xmark_to_bracket.py

deleted100644 → 0
+0 −56
Original line number Diff line number Diff line
from lxml import etree
import lxml.sax
from xml.sax.handler import ContentHandler

class XMarkContentHandler(ContentHandler):
    """Convert SAX-like XML events into a bracket-notation string.

    Every element becomes ``{label ...}``. Each attribute is emitted as a
    nested ``{name{value}}`` node directly after the element label, and each
    chunk of character data becomes a ``{text}`` child. The characters
    ``{``, ``}`` and ``\\`` occurring in attribute names, attribute values
    and text are backslash-escaped so they cannot be confused with the
    structural brackets. Element labels are written as-is.

    The accumulated result is available in the ``bn`` attribute.
    """

    # Built once for the whole class; the original rebuilt this table on
    # every event, in three separate copies.
    _ESCAPES = str.maketrans({"{": r"\{", "}": r"\}", "\\": r"\\"})

    def __init__(self):
        super().__init__()
        self.bn = ""

    @classmethod
    def _escape(cls, text):
        """Return *text* with ``{``, ``}`` and ``\\`` backslash-escaped."""
        return text.translate(cls._ESCAPES)

    def startElementNS(self, name, qname, attributes):
        """Open a node for the element and append its attributes.

        ``name`` is a ``(uri, localname)`` pair; only the local name is used
        as the label. ``attributes`` maps ``(uri, localname)`` keys to
        values; attributes are emitted sorted by local name so the output
        does not depend on the order in which they were supplied.
        """
        _uri, localname = name
        pieces = ["{", localname]
        by_local_name = sorted(dict(attributes).items(),
                               key=lambda item: item[0][1])
        for key, value in by_local_name:
            pieces.append(
                "{" + self._escape(key[1]) + "{" + self._escape(value) + "}}"
            )
        self.bn += "".join(pieces)

    def endElementNS(self, name, qname):
        """Close the node opened by the matching ``startElementNS``."""
        self.bn += "}"

    def characters(self, data):
        """Append one chunk of character data as an escaped ``{text}`` leaf."""
        self.bn += "{" + self._escape(data) + "}"

print("--- Loading XMARK dataset.")

# Parse the input file named 'standard' (resolved against the current working
# directory) with DTD loading enabled and blank text removed.
xmark_parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
xmark_data_tree = etree.parse('standard', xmark_parser)
root = xmark_data_tree.getroot()

# Output files.
# The context manager guarantees the file is flushed and closed even if the
# serialisation of some subtree raises. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's locale default.
with open('xmark.bracket', 'w', encoding='utf-8') as xmark_bracket:
    print("--- Processing each child of xmark's root.")

    # Each direct child of the root is written as one bracket-notation tree
    # per output line.
    for tree_id, child in enumerate(root, start=1):
        # Printing simple progress.
        if tree_id % 10000 == 0:
            print("- Tree %s" % (tree_id))
        # A fresh handler per subtree so every line starts from an empty string.
        handler = XMarkContentHandler()
        lxml.sax.saxify(child, handler)
        xmark_bracket.write(handler.bn + "\n")

    print("--- Closing output files.")