Commit eba5bb56 authored by Thomas Huetter

workaround: escaping '\', '{' and '}' for dblp and swissprot

parent 03845e51
...@@ -15,7 +15,11 @@ class DBLPContentHandler(ContentHandler): ...@@ -15,7 +15,11 @@ class DBLPContentHandler(ContentHandler):
d = dict(attributes) d = dict(attributes)
# Sort the attributes by their keys. # Sort the attributes by their keys.
for key, value in sorted(d.items(), key = lambda element : element[0][1]): for key, value in sorted(d.items(), key = lambda element : element[0][1]):
self.bn += "{" + key[1] + "{" + value + "}}" self.bn += "{" + key[1].translate(str.maketrans({"{": r"\{",
"}": r"\}",
"\\": r"\\"})) + "{" + value.translate(str.maketrans({"{": r"\{",
"}": r"\}",
"\\": r"\\"})) + "}}"
# Close tag. # Close tag.
def endElementNS(self, name, qname): def endElementNS(self, name, qname):
...@@ -23,7 +27,9 @@ class DBLPContentHandler(ContentHandler): ...@@ -23,7 +27,9 @@ class DBLPContentHandler(ContentHandler):
# Tag content. # Tag content.
def characters(self, data): def characters(self, data):
self.bn += "{" + data + "}" self.bn += "{" + data.translate(str.maketrans({"{": r"\{",
"}": r"\}",
"\\": r"\\"})) + "}"
print("--- Loading DBLP dataset.") print("--- Loading DBLP dataset.")
......
...@@ -15,7 +15,11 @@ class SwissprotContentHandler(ContentHandler): ...@@ -15,7 +15,11 @@ class SwissprotContentHandler(ContentHandler):
d = dict(attributes) d = dict(attributes)
# Sort the attributes by their keys. # Sort the attributes by their keys.
for key, value in sorted(d.items(), key = lambda element : element[0][1]): for key, value in sorted(d.items(), key = lambda element : element[0][1]):
self.bn += "{" + key[1] + "{" + value + "}}" self.bn += "{" + key[1].translate(str.maketrans({"{": r"\{",
"}": r"\}",
"\\": r"\\"})) + "{" + value.translate(str.maketrans({"{": r"\{",
"}": r"\}",
"\\": r"\\"})) + "}}"
# Close tag. # Close tag.
def endElementNS(self, name, qname): def endElementNS(self, name, qname):
...@@ -23,12 +27,14 @@ class SwissprotContentHandler(ContentHandler): ...@@ -23,12 +27,14 @@ class SwissprotContentHandler(ContentHandler):
# Tag content. # Tag content.
def characters(self, data): def characters(self, data):
self.bn += "{" + data + "}" self.bn += "{" + data.translate(str.maketrans({"{": r"\{",
"}": r"\}",
"\\": r"\\"})) + "}"
print("--- Loading Swissprot dataset.") print("--- Loading Swissprot dataset.")
swissprot_parser = etree.XMLParser(load_dtd=False, remove_blank_text=True) swissprot_parser = etree.XMLParser(load_dtd=False, remove_blank_text=True)
swissprot_data_tree = etree.parse('uniprot_sprot.xml', dblp_parser) swissprot_data_tree = etree.parse('uniprot_sprot.xml', swissprot_parser)
root = swissprot_data_tree.getroot() root = swissprot_data_tree.getroot()
# Output files. # Output files.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment