Commit 82e15b11 authored by Mateusz Pawlik's avatar Mateusz Pawlik
Browse files

Added linux kernel source - what I had before.

parent f1be0768
# diffs-large-trees-data
# Data collection for paper on Diffs for Large Trees
Instructions for each data source can be found in that source's directory.
## List of data
The following data are currently in the collection:
- `linux` - directory trees of the Linux kernel repository
## Data statistics
Each source provides pairs of trees. Each row in the table below describes a
single tree pair.
|Source|File|Tree size|File|Tree size|TED|Size diff|
|------|----|--------:|----|--------:|--:|--------:|
|Linux |v5.6-rc3|214624|v5.6-rc4|214621|28|3|
\ No newline at end of file
# Directory trees of the Linux kernel repository
Execute the `download.sh` script to download the raw data. Here, the raw data
is the XML output of the GNU `tree` command, that is, the output of `tree -X .`
executed in the root of the Linux kernel git repository.
Run the following commands to convert the raw data into bracket notation:
```
mkdir bracket
python3 default_xml_to_bracket.py raw/v5.6-rc3.xml bracket/v5.6-rc3.bracket
python3 default_xml_to_bracket.py raw/v5.6-rc4.xml bracket/v5.6-rc4.bracket
```
\ No newline at end of file
#!/usr/bin/env python3
# The MIT License (MIT)
# Copyright (c) 2020 Mateusz Pawlik.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from lxml import etree
import lxml.sax
from xml.sax.handler import ContentHandler
import argparse
from xml.sax import make_parser, handler
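# This script converts XML to bracket notation as follows:
# - every element becomes a node `{name ...}` whose label is the tag name;
# - every attribute becomes a child node `{key{value}}` of its element,
#   written right after the element label;
# - every non-empty (whitespace-stripped) piece of text content becomes a
#   child node `{text}`;
# - the characters `{`, `}` and `\` in attribute keys, attribute values and
#   text content are escaped with a backslash.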
# This class implements the sax-like events for converting XML elements into
# bracket-notation nodes and labels.
class DefaultContentHandler(handler.ContentHandler):
    """SAX content handler that writes an XML document as bracket notation.

    The output is streamed to ``file`` while the document is parsed:

    * an element start writes ``{name`` followed by one ``{key{value}}``
      node per attribute, ordered by attribute key;
    * non-empty text content writes ``{text}``;
    * an element end writes the closing ``}``.

    The characters ``{``, ``}`` and ``\\`` are backslash-escaped in attribute
    keys, attribute values and text content.

    Args:
        file: Writable text file object receiving the bracket notation. It is
            closed by ``endDocument``.
    """

    # Translation table escaping the characters that are structural in
    # bracket notation. Built once and shared by all instances.
    t = str.maketrans({
        "{": r"\{",
        "}": r"\}",
        "\\": r"\\",
    })

    def __init__(self, file):
        super().__init__()
        # Not used by this class; kept so existing attribute access still works.
        self.bn = ""
        self.file = file

    def startElement(self, name, attrs):
        """Open a node labelled ``name`` and emit its attributes as children."""
        self.file.write("{" + name)
        # Sort by the full attribute key so the output is deterministic.
        # Indexing into the key (e.g. ``key[1]``) would order by a single
        # character and raise IndexError for one-character attribute names.
        for key, value in sorted(attrs.items(), key=lambda item: item[0]):
            self.file.write(
                "{" + key.translate(self.t) + "{" + value.translate(self.t) + "}}"
            )

    def endElement(self, name):
        """Close the node opened by the matching ``startElement``."""
        self.file.write("}")

    def characters(self, content):
        """Emit stripped text content as a leaf node; skip whitespace-only text."""
        content = content.strip()
        if content:
            self.file.write("{" + content.translate(self.t) + "}")

    def endDocument(self):
        """Close the output file once the whole document has been processed."""
        self.file.close()
def main():
    """Parse the command line and convert the input XML to bracket notation.

    Takes two positional arguments: the path of the XML file to read and the
    path of the file to write the bracket notation to.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        type=str,
        dest='input_xml',
        help="Path to input XML file."
    )
    parser.add_argument(
        type=str,
        dest='output',
        help="Path to output file."
    )
    args = parser.parse_args()

    sax_parser = make_parser()
    # The context manager guarantees the output file is closed even when
    # parsing fails before the handler's endDocument() is reached. Closing an
    # already-closed file is a no-op, so the handler closing it too is harmless.
    with open(args.output, 'w') as output_file:
        sax_parser.setContentHandler(DefaultContentHandler(output_file))
        sax_parser.parse(args.input_xml)


if __name__ == "__main__":
    main()
#!/usr/bin/env bash
# Downloads the raw data: clones the linux kernel repository into raw/linux,
# writes the XML output of `tree -X .` for each listed tag to raw/<tag>.xml,
# and removes the clone afterwards.

# Abort on the first failing command, unset variable or failed pipeline stage,
# so a failed `cd` or `git clone` cannot cause the following `git checkout`,
# `tree` and `rm -rf` to run in the wrong directory.
set -euo pipefail

# -p: do not fail when raw/ is left over from a previous run.
mkdir -p raw
cd raw
git clone https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
cd linux
for tag in v5.6-rc3 v5.6-rc4; do
  git checkout "${tag}"
  tree -X . > "../${tag}.xml"
done
cd ..
rm -rf linux
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment