Commit fd127aa5 authored by Thomas Huetter's avatar Thomas Huetter

xmark dataset

parent eba5bb56
File added
<!-- DTD for auction database -->
<!-- $Id: auction.dtd,v 1.15 2001/01/29 21:42:35 albrecht Exp $ -->
<!ELEMENT site (regions, categories, catgraph, people, open_auctions, closed_auctions)>
<!ELEMENT categories (category+)>
<!ELEMENT category (name, description)>
<!ATTLIST category id ID #REQUIRED>
<!ELEMENT name (#PCDATA)>
<!ELEMENT description (text | parlist)>
<!ELEMENT text (#PCDATA | bold | keyword | emph)*>
<!ELEMENT bold (#PCDATA | bold | keyword | emph)*>
<!ELEMENT keyword (#PCDATA | bold | keyword | emph)*>
<!ELEMENT emph (#PCDATA | bold | keyword | emph)*>
<!ELEMENT parlist (listitem)*>
<!ELEMENT listitem (text | parlist)*>
<!ELEMENT catgraph (edge*)>
<!ELEMENT edge EMPTY>
<!ATTLIST edge from IDREF #REQUIRED to IDREF #REQUIRED>
<!ELEMENT regions (africa, asia, australia, europe, namerica, samerica)>
<!ELEMENT africa (item*)>
<!ELEMENT asia (item*)>
<!ELEMENT australia (item*)>
<!ELEMENT namerica (item*)>
<!ELEMENT samerica (item*)>
<!ELEMENT europe (item*)>
<!ELEMENT item (location, quantity, name, payment, description, shipping, incategory+, mailbox)>
<!ATTLIST item id ID #REQUIRED
featured CDATA #IMPLIED>
<!ELEMENT location (#PCDATA)>
<!ELEMENT quantity (#PCDATA)>
<!ELEMENT payment (#PCDATA)>
<!ELEMENT shipping (#PCDATA)>
<!ELEMENT reserve (#PCDATA)>
<!ELEMENT incategory EMPTY>
<!ATTLIST incategory category IDREF #REQUIRED>
<!ELEMENT mailbox (mail*)>
<!ELEMENT mail (from, to, date, text)>
<!ELEMENT from (#PCDATA)>
<!ELEMENT to (#PCDATA)>
<!ELEMENT date (#PCDATA)>
<!ELEMENT itemref EMPTY>
<!ATTLIST itemref item IDREF #REQUIRED>
<!ELEMENT personref EMPTY>
<!ATTLIST personref person IDREF #REQUIRED>
<!ELEMENT people (person*)>
<!ELEMENT person (name, emailaddress, phone?, address?, homepage?, creditcard?, profile?, watches?)>
<!ATTLIST person id ID #REQUIRED>
<!ELEMENT emailaddress (#PCDATA)>
<!ELEMENT phone (#PCDATA)>
<!ELEMENT address (street, city, country, province?, zipcode)>
<!ELEMENT street (#PCDATA)>
<!ELEMENT city (#PCDATA)>
<!ELEMENT province (#PCDATA)>
<!ELEMENT zipcode (#PCDATA)>
<!ELEMENT country (#PCDATA)>
<!ELEMENT homepage (#PCDATA)>
<!ELEMENT creditcard (#PCDATA)>
<!ELEMENT profile (interest*, education?, gender?, business, age?)>
<!ATTLIST profile income CDATA #IMPLIED>
<!ELEMENT interest EMPTY>
<!ATTLIST interest category IDREF #REQUIRED>
<!ELEMENT education (#PCDATA)>
<!ELEMENT income (#PCDATA)>
<!ELEMENT gender (#PCDATA)>
<!ELEMENT business (#PCDATA)>
<!ELEMENT age (#PCDATA)>
<!ELEMENT watches (watch*)>
<!ELEMENT watch EMPTY>
<!ATTLIST watch open_auction IDREF #REQUIRED>
<!ELEMENT open_auctions (open_auction*)>
<!ELEMENT open_auction (initial, reserve?, bidder*, current, privacy?, itemref, seller, annotation, quantity, type, interval)>
<!ATTLIST open_auction id ID #REQUIRED>
<!ELEMENT privacy (#PCDATA)>
<!ELEMENT initial (#PCDATA)>
<!ELEMENT bidder (date, time, personref, increase)>
<!ELEMENT seller EMPTY>
<!ATTLIST seller person IDREF #REQUIRED>
<!ELEMENT current (#PCDATA)>
<!ELEMENT increase (#PCDATA)>
<!ELEMENT type (#PCDATA)>
<!ELEMENT interval (start, end)>
<!ELEMENT start (#PCDATA)>
<!ELEMENT end (#PCDATA)>
<!ELEMENT time (#PCDATA)>
<!ELEMENT status (#PCDATA)>
<!ELEMENT amount (#PCDATA)>
<!ELEMENT closed_auctions (closed_auction*)>
<!ELEMENT closed_auction (seller, buyer, itemref, price, date, quantity, type, annotation?)>
<!ELEMENT buyer EMPTY>
<!ATTLIST buyer person IDREF #REQUIRED>
<!ELEMENT price (#PCDATA)>
<!ELEMENT annotation (author, description?, happiness)>
<!ELEMENT author EMPTY>
<!ATTLIST author person IDREF #REQUIRED>
<!ELEMENT happiness (#PCDATA)>
#!/bin/bash
# Download the XML file.
wget -v http://www.ins.cwi.nl/projects/xmark/Assets/standard.gz
# Download the DTD file.
wget -v http://www.ins.cwi.nl/projects/xmark/Assets/auction.dtd
# Extract the XML file.
gzip -d standard.gz
\ No newline at end of file
#!/bin/bash
# | sort by number of nodes (equivalent to number of "{")
cat $1 | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > "${1%.bracket}_sorted.bracket"
This diff is collapsed.
from lxml import etree
import lxml.sax
from xml.sax.handler import ContentHandler
# This class implements the sax-like events for converting XML elemnts into
# bracket-notation nodes and labels.
class XMarkContentHandler(ContentHandler):
def __init__(self):
self.bn = ""
# Open tag.
def startElementNS(self, name, qname, attributes):
uri, localname = name
self.bn += "{" + localname
d = dict(attributes)
# Sort the attributes by their keys.
for key, value in sorted(d.items(), key = lambda element : element[0][1]):
self.bn += "{" + key[1].translate(str.maketrans({"{": r"\{",
"}": r"\}",
"\\": r"\\"})) + "{" + value.translate(str.maketrans({"{": r"\{",
"}": r"\}",
"\\": r"\\"})) + "}}"
# Close tag.
def endElementNS(self, name, qname):
self.bn += "}"
# Tag content.
def characters(self, data):
self.bn += "{" + data.translate(str.maketrans({"{": r"\{",
"}": r"\}",
"\\": r"\\"})) + "}"
print("--- Loading XMARK dataset.")
xmark_parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
xmark_data_tree = etree.parse('standard', xmark_parser)
root = xmark_data_tree.getroot()
# Output files.
xmark_bracket = open('xmark.bracket', 'w')
print("--- Processing each child of xmark's root.")
tree_id = 0
for child in root:
tree_id += 1
# Printing simple progress.
if tree_id % 10000 == 0:
print("- Tree %s" % (tree_id))
handler = XMarkContentHandler()
lxml.sax.saxify(child, handler)
xmark_bracket.write(handler.bn + "\n")
print("--- Closing output files.")
xmark_bracket.close()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment