Commit eba5bb56 authored by Thomas Huetter
Browse files

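workaround: escaping '\', '{' and '}' for dblp and swissprot

Note: the diff below also changes the Swissprot script to parse 'uniprot_sprot.xml' with swissprot_parser instead of dblp_parser.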

parent 03845e51
Loading
Loading
Loading
Loading
+8 −2
Original line number | Original line | Diff line number | Diff line
@@ -15,7 +15,11 @@ class DBLPContentHandler(ContentHandler):
        d = dict(attributes)
        d = dict(attributes)
        # Sort the attributes by their keys.
        # Sort the attributes by their keys.
        for key, value in sorted(d.items(), key = lambda element : element[0][1]):
        for key, value in sorted(d.items(), key = lambda element : element[0][1]):
            self.bn += "{" + key[1] + "{" + value + "}}"
            self.bn += "{" + key[1].translate(str.maketrans({"{":  r"\{",
                                          "}":  r"\}",
                                          "\\": r"\\"})) + "{" + value.translate(str.maketrans({"{":  r"\{",
                                          "}":  r"\}",
                                          "\\": r"\\"})) + "}}"


    # Close tag.
    # Close tag.
    def endElementNS(self, name, qname):
    def endElementNS(self, name, qname):
@@ -23,7 +27,9 @@ class DBLPContentHandler(ContentHandler):


    # Tag content.
    # Tag content.
    def characters(self, data):
    def characters(self, data):
        self.bn += "{" + data + "}"
        self.bn += "{" + data.translate(str.maketrans({"{":  r"\{",
                                          "}":  r"\}",
                                          "\\": r"\\"})) + "}"


print("--- Loading DBLP dataset.")
print("--- Loading DBLP dataset.")


+9 −3
Original line number | Original line | Diff line number | Diff line
@@ -15,7 +15,11 @@ class SwissprotContentHandler(ContentHandler):
        d = dict(attributes)
        d = dict(attributes)
        # Sort the attributes by their keys.
        # Sort the attributes by their keys.
        for key, value in sorted(d.items(), key = lambda element : element[0][1]):
        for key, value in sorted(d.items(), key = lambda element : element[0][1]):
            self.bn += "{" + key[1] + "{" + value + "}}"
            self.bn += "{" + key[1].translate(str.maketrans({"{":  r"\{",
                                          "}":  r"\}",
                                          "\\": r"\\"})) + "{" + value.translate(str.maketrans({"{":  r"\{",
                                          "}":  r"\}",
                                          "\\": r"\\"})) + "}}"


    # Close tag.
    # Close tag.
    def endElementNS(self, name, qname):
    def endElementNS(self, name, qname):
@@ -23,12 +27,14 @@ class SwissprotContentHandler(ContentHandler):


    # Tag content.
    # Tag content.
    def characters(self, data):
    def characters(self, data):
        self.bn += "{" + data + "}"
        self.bn += "{" + data.translate(str.maketrans({"{":  r"\{",
                                          "}":  r"\}",
                                          "\\": r"\\"})) + "}"


print("--- Loading Swissprot dataset.")
print("--- Loading Swissprot dataset.")


swissprot_parser = etree.XMLParser(load_dtd=False, remove_blank_text=True)
swissprot_parser = etree.XMLParser(load_dtd=False, remove_blank_text=True)
swissprot_data_tree = etree.parse('uniprot_sprot.xml', dblp_parser)
swissprot_data_tree = etree.parse('uniprot_sprot.xml', swissprot_parser)
root = swissprot_data_tree.getroot()
root = swissprot_data_tree.getroot()


# Output files.
# Output files.