Commit 50342882 authored by Mateusz Pawlik
Browse files

Added conversion B used in reproducibility paper.

parent 393ca3e0
#!/usr/bin/env python3
# The MIT License (MIT)
# Copyright (c) 2017 Thomas Hütter, Mateusz Pawlik.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from lxml import etree
import lxml.sax
from xml.sax.handler import ContentHandler
# This script converts DBLP from XML to bracket notation.
# NOTE: Filenames are hardcoded in this script.
# This class implements the sax-like events for converting XML elements into
# bracket-notation nodes and labels.
class DBLPContentHandler(ContentHandler):
    """Build the bracket-notation string of one XML subtree from SAX events.

    Every element becomes a node ``{tag...}`` and every text chunk reported
    outside a title becomes a leaf ``{text}``.  A ``title`` element is treated
    specially: all text reported while inside it, including the text of nested
    elements, is appended to one single child label, giving
    ``{title{...}}``; the nested tags themselves are dropped.  XML attributes
    are ignored.

    The result is accumulated in the ``bn`` attribute.
    """

    # Escapes the characters that carry meaning in bracket notation.  Built
    # once here rather than on every ``characters`` call.
    _ESCAPES = str.maketrans({"{": r"\{", "}": r"\}", "\\": r"\\"})

    def __init__(self):
        # Initialise the base handler so its own state is set up as well.
        super().__init__()
        self.bn = ""
        self.in_title = False

    # Open tag.
    def startElementNS(self, name, qname, attributes):
        """Open a node for the element, unless it is nested inside a title.

        ``name`` is a ``(uri, localname)`` pair; only the local name is used.
        """
        _uri, localname = name
        if localname == 'title':
            self.in_title = True
            # Open the title node and, right away, its single text child.
            self.bn += "{" + localname + "{"
        elif not self.in_title:
            self.bn += "{" + localname

    # Close tag.
    def endElementNS(self, name, qname):
        """Close the node opened by the matching ``startElementNS`` call."""
        _uri, localname = name
        if localname == 'title':
            # Close both the text child and the title node itself.
            self.bn += "}}"
            self.in_title = False
        elif not self.in_title:
            self.bn += "}"

    # Tag content.
    def characters(self, data):
        """Append escaped text: bare inside a title, as ``{text}`` elsewhere."""
        escaped = data.translate(self._ESCAPES)
        if self.in_title:
            # The enclosing brackets were already emitted when the title
            # element was opened, so the text is appended without its own.
            self.bn += escaped
        else:
            self.bn += "{" + escaped + "}"
print("--- Loading DBLP dataset.")
# load_dtd=True makes lxml load the DTD referenced by the document
# (presumably required for DBLP's named character entities - TODO confirm).
# NOTE: the whole input document is parsed into memory before conversion.
dblp_parser = etree.XMLParser(load_dtd=True, remove_blank_text=True)
dblp_data_tree = etree.parse('dblp-2017-11-01.xml', dblp_parser)
root = dblp_data_tree.getroot()

# Output files.
# The context manager guarantees the file is closed even if the conversion
# raises; the explicit UTF-8 encoding keeps non-ASCII text from depending on
# the platform's default encoding.
with open('dblp_c2.bracket', 'w', encoding='utf-8') as dblp_bracket:
    print("--- Processing each child of DBLP's root.")
    # Each child of the root is converted into one tree, one tree per line.
    for tree_id, child in enumerate(root, start=1):
        # Printing simple progress.
        if tree_id % 100000 == 0:
            print(f"- Tree {tree_id}")
        # A fresh handler per child so that each output line starts empty.
        handler = DBLPContentHandler()
        lxml.sax.saxify(child, handler)
        dblp_bracket.write(handler.bn + "\n")
    print("--- Closing output files.")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment