Commit 07a19aa4 authored by Mateusz Pawlik's avatar Mateusz Pawlik

Merge branch 'develop'

parents f3ab7750 bb064f34
*.DS_Store
# Require CMake 3.5 or newer: this file uses INTERFACE libraries (available
# since CMake 3.0), and CMake 4.x refuses to configure projects that declare a
# minimum version below 3.5. The upper bound of the range opts in to newer
# policy defaults when a newer CMake is used; older CMake ignores it.
cmake_minimum_required(VERSION 3.5...3.28)

# Create the project.
project(ted-experiments)

# Language standard: plain C++11 without GNU extensions (i.e. -std=c++11).
# MUST be set before the targets are created so that they pick it up.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Remaining compiler flags: enable warnings and optimize with -O3.
# MUST be declared after project().
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3")
# Decide which timing implementation to use.
# NOTE: This check comes from common-code.
# If <time.h> does not provide clock_gettime, GETRUSAGE is defined for the
# targets of this directory (presumably selecting a getrusage-based fallback
# in the timing code -- confirm in common-code).
include(CheckSymbolExists)
check_symbol_exists(clock_gettime "time.h" HAVE_CLOCK_GETTIME)
if(NOT HAVE_CLOCK_GETTIME)
  add_definitions(-DGETRUSAGE)
endif()
# Executable 'ted-join-experiments', built from the join experiments source.
add_executable(ted-join-experiments
  src/join_algs/join_algs_experiments.cc
)

# Link the executable against the timing library.
target_link_libraries(ted-join-experiments TimingLibrary)

# Make the timing headers from common-code visible to the executable.
target_include_directories(ted-join-experiments
  PUBLIC external/common-code/timings
)
# Executable 'ted-algs-experiments', built from the TED algorithms experiments
# source.
add_executable(ted-algs-experiments
  src/ted_algs/ted_algs_experiments.cc
)

# Link the executable against the timing library.
target_link_libraries(ted-algs-experiments TimingLibrary)

# Make the timing headers from common-code visible to the executable.
target_include_directories(ted-algs-experiments
  PUBLIC external/common-code/timings
)
# Timing library 'TimingLibrary', compiled from the single source file shipped
# with common-code.
add_library(TimingLibrary
  external/common-code/timings/timing.cxx
)
# Header-only library 'TreeSimilarity' with our algorithms.
# As an INTERFACE library it compiles nothing itself; it only hands the include
# directories listed below to every target that links against it.
add_library(TreeSimilarity INTERFACE)

# Header directories of the tree-similarity checkout (search order preserved).
target_include_directories(TreeSimilarity INTERFACE
  external/tree-similarity/src/cost_model
  external/tree-similarity/src/data_structures
  external/tree-similarity/src/join
  external/tree-similarity/src/join/tjoin
  external/tree-similarity/src/join/tang
  external/tree-similarity/src/join/naive
  external/tree-similarity/src/join/binary_branches
  external/tree-similarity/src/join/label_histogram
  external/tree-similarity/src/join/degree_histogram
  external/tree-similarity/src/join/leaf_dist_histogram
  external/tree-similarity/src/join/histogram
  external/tree-similarity/src/label
  external/tree-similarity/src/node
  external/tree-similarity/src/parser
  external/tree-similarity/src/ted
  external/tree-similarity/src/ted_ub
  external/tree-similarity/src/ted_lb
  external/tree-similarity/src/join/guha
)
# Link both experiment executables against the header-only TreeSimilarity
# library so that the compiler finds its header files.
foreach(experiment_target ted-join-experiments ted-algs-experiments)
  target_link_libraries(${experiment_target} TreeSimilarity)
endforeach()
# Tree Edit Distance similarity join - experiments
# Tree Edit Distance Experiments
Currently the experiments framework contains stand-alone tree edit distance
and tree similarity join algorithms.
Follow the instructions below to reproduce the environment and the experiments.
## ICDE 2019 Reproducibility
This repository contains experiments of our ICDE 2019 paper
[Effective Filters and Linear Time Verification for Tree Similarity Joins](http://eplus.uni-salzburg.at/obvusboa/download/pdf/4486886).
To reproduce the experiments of the ICDE 2019 paper, check out the tag
`icde2019` of both this repository and the
[Tree Similarity library](https://github.com/DatabaseGroup/tree-similarity/tree/develop)
repository.
Obtain datasets from our
[Datasets repository](https://frosch.cosy.sbg.ac.at/mpawlik/ted-datasets).
Execute the experiments with all config files in `configs/icde2019` directory.
See execution details below. You may need to modify `--dataset_path` parameter
value when executing the experiments.
For the LGM Upper Bound and BSM verification experiment, certain views must be
present in the database. After executing all experiments, execute `src/ted_algs/view_queries.sql`
on the database holding the experiment results.
Plot the results by running the `src/plots/create_all_plots.sh` script from the
`src/plots/` directory.
## Build the project
After cloning the repository, clone the external libraries to `external`
subdirectory.
```bash
mkdir external
cd external
```
Clone the Timing library for runtime measurements.
```bash
git clone git@frosch.cosy.sbg.ac.at:wmann/common-code.git
```
Clone the Tree Similarity library with the algorithms (the `develop` branch
is currently the most recent).
```bash
git clone --branch develop https://github.com/DatabaseGroup/tree-similarity.git
```
Then execute the following from the project's root directory.
```bash
mkdir build
cd build
cmake ..
make
```
## Prepare a PostgreSQL database for storing the results
Install [PostgreSQL](https://www.postgresql.org/).
Create a database using the SQL file ``db/create_db.sql``.
Create a service file ``~/.pg_service.conf`` on the machine where you execute
the experiments. The service file holds the connection details to the database
where the results will be stored. An example service file looks as follows.
```
[ted-exp]
host=mydb.sbg.ac.at
port=5432
user=ted
password=letmethrough
dbname=ted_experiments
```
Executing experiments requires dataset details to be present in the `dataset`
table. Visit our
[Datasets repository](https://frosch.cosy.sbg.ac.at/mpawlik/ted-datasets)
to learn how we obtain datasets. Use the `--service service` option of the
`statistics/statistics.py` script to register a dataset in the `dataset` table.
## Executing
We use [Python3](https://www.python.org/) to execute the experiments.
### TED Join
The script `src/join_algs/join_algs_experiments.py` executes tree similarity
join experiments.
It uses a config JSON file to specify the experiment parameters. Example config
files can be found in `configs/icde2019` directory.
Example experiment execution can be performed as follows.
```bash
python3 src/join_algs/join_algs_experiments.py --config configs/icde2019/bolzano.json --dataset_path /path_to/ted-datasets/ --service service
```
### TED Algorithms
The script `src/ted_algs/ted_algs_experiments.py` executes tree edit distance
algorithm experiments.
It uses a config JSON file to specify the experiment parameters. Example config
files can be found in `configs/icde2019/upperbound` directory.
Example experiment execution can be performed as follows.
```bash
python3 src/ted_algs/ted_algs_experiments.py --config configs/icde2019/upperbound/sentiment.json --dataset_path /path_to/ted-datasets/ --service service
```
{
"datasets": [
"bolzano/bolzano_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
16.0, 17.0, 18.0, 19.0, 20.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "histogram_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "binary_branches_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "-1"
},
{ "name": "tang_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
}
]
}
\ No newline at end of file
{
"datasets": [
"dblp/dblp_no_www_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
}
]
}
{
"datasets": [
"dblp/dblp_10000_sorted.bracket",
"dblp/dblp_50000_sorted.bracket",
"dblp/dblp_100000_sorted.bracket"
],
"thresholds": [
6.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "histogram_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "binary_branches_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "-1"
},
{ "name": "tang_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
}
]
}
{
"datasets": [
"python/python_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 5.0, 10.0, 15.0, 20.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
}
]
}
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "histogram_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "binary_branches_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "-1"
},
{ "name": "tang_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
10.0, 11.0
],
"algorithms": [
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "2",
"reference_set_id": "4"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "4",
"reference_set_id": "4"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "4"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "16",
"reference_set_id": "4"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "32",
"reference_set_id": "4"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
12.0, 13.0
],
"algorithms": [
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "2",
"reference_set_id": "5"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "4",
"reference_set_id": "5"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "5"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "16",
"reference_set_id": "5"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "32",
"reference_set_id": "5"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
14.0, 15.0
],
"algorithms": [
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "2",
"reference_set_id": "6"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "4",
"reference_set_id": "6"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "6"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "16",
"reference_set_id": "6"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "32",
"reference_set_id": "6"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0
],
"algorithms": [
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "2",
"reference_set_id": "0"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "4",
"reference_set_id": "0"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "0"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "16",
"reference_set_id": "0"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "32",
"reference_set_id": "0"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
4.0, 5.0
],
"algorithms": [
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "2",
"reference_set_id": "1"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "4",
"reference_set_id": "1"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "1"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "16",
"reference_set_id": "1"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "32",
"reference_set_id": "1"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
6.0, 7.0
],
"algorithms": [
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "2",
"reference_set_id": "2"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "4",
"reference_set_id": "2"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "2"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "16",
"reference_set_id": "2"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "32",
"reference_set_id": "2"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
8.0, 9.0
],
"algorithms": [
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "2",
"reference_set_id": "3"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "4",
"reference_set_id": "3"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "3"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "16",
"reference_set_id": "3"
},
{ "name": "guha_rsb_join",
"verification_algorithm" : "APTED",
"reference_set_size": "32",
"reference_set_id": "3"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
10.0, 11.0
],
"algorithms": [
{ "name": "guha_rsc_join",
"verification_algorithm" : "APTED",
"reference_set_size": "2",
"reference_set_id": "4"
},
{ "name": "guha_rsc_join",
"verification_algorithm" : "APTED",
"reference_set_size": "4",
"reference_set_id": "4"
},
{ "name": "guha_rsc_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "4"
},
{ "name": "guha_rsc_join",
"verification_algorithm" : "APTED",
"reference_set_size": "16",
"reference_set_id": "4"
},
{ "name": "guha_rsc_join",
"verification_algorithm" : "APTED",
"reference_set_size": "32",
"reference_set_id": "4"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
12.0, 13.0
],
"algorithms": [
{ "name": "guha_rsc_join",
"verification_algorithm" : "APTED",
"reference_set_size": "2",
"reference_set_id": "5"
},
{ "name": "guha_rsc_join",
"verification_algorithm" : "APTED",
"reference_set_size": "4",
"reference_set_id": "5"
},
{ "name": "guha_rsc_join",
"verification_algorithm" : "APTED",
"reference_set_size": "8",
"reference_set_id": "5"
},
{ "name": "guha_rsc_join",
"verification_algorithm" : "APTED",