Commit 8e5709be authored by Thomas Huetter
Browse files

change to public tree-similarity repository

parent b57f7a50
......@@ -17,8 +17,8 @@ endif()
# Build executable with the experiments.
add_executable(
ted-join-experiments # EXECUTABLE NAME
src/experiments.cc # EXECUTABLE SOURCE
ted-join-experiments # EXECUTABLE NAME
src/join_algs/join_algs_experiments.cc # EXECUTABLE SOURCE
)
target_link_libraries(
ted-join-experiments # TARGET EXECUTABLE NAME
......@@ -55,16 +55,17 @@ add_library(
)
target_include_directories(
TreeSimilarity INTERFACE
external/tree-similarity-private/src/allpairs
external/tree-similarity-private/src/cost_model
external/tree-similarity-private/src/data_structures
external/tree-similarity-private/src/join
external/tree-similarity-private/src/label
external/tree-similarity-private/src/node
external/tree-similarity-private/src/parser
external/tree-similarity-private/src/tree_to_set_converter
external/tree-similarity-private/src/ted
external/tree-similarity-private/src/ted_ub
external/tree-similarity/src/cost_model
external/tree-similarity/src/data_structures
external/tree-similarity/src/join
external/tree-similarity/src/join/tjoin
external/tree-similarity/src/join/tang
external/tree-similarity/src/join/naive
external/tree-similarity/src/label
external/tree-similarity/src/node
external/tree-similarity/src/parser
external/tree-similarity/src/ted
external/tree-similarity/src/ted_ub
)
# Let the compiler know where to find the header files of the TreeSimilarity library.
......
......@@ -9,7 +9,7 @@ cd external
# Timing library.
git clone git@frosch.cosy.sbg.ac.at:wmann/common-code.git
# Tree Similarity library.
git clone git@frosch.cosy.sbg.ac.at:thuetter/tree-similarity-private.git
git clone -b develop https://github.com/DatabaseGroup/tree-similarity.git
```
Then execute the following from the project's root directory.
......
{
"datasets": [
"bolzano/bolzano_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "tang_join",
"verification_algorithm" : "Touzet",
"upperbound": "none"
}
]
}
\ No newline at end of file
{
"datasets": [
"dblp/dblp_no_www_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
}
]
}
\ No newline at end of file
{
"datasets": [
"python/python_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 5.0, 10.0, 15.0, 20.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
11.0, 12.0, 13.0, 14.0, 15.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "tang_join",
"verification_algorithm" : "Touzet",
"upperbound": "none"
}
]
}
\ No newline at end of file
{
"datasets": [
"swissprot/swissprot_sorted.bracket"
],
"thresholds": [
1.0, 5.0, 10.0, 15.0, 20.0, 25.0, 30.0,
35.0, 40.0, 45.0, 50.0, 55.0, 60.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
}
]
}
\ No newline at end of file
{
"datasets": [
"/home/mpawlik/Remote/ted-join-experiments/datasets/L_preprocessed.txt"
],
"thresholds": [
10.0
],
"algorithms": [
"--tzd", "--lg"
]
}
\ No newline at end of file
......@@ -31,11 +31,10 @@ CREATE TABLE dataset (
-- sum_subproblems bigint,
-- result_set_size bigint,
-- sum_subproblem_optimum bigint,
-- optimum_time bigint,
-- inv_list_lookups bigint,
-- optimum_time bigint
DROP TABLE IF EXISTS naive_self_join;
CREATE TABLE naive_self_join (
DROP TABLE IF EXISTS naive_join;
CREATE TABLE naive_join (
-- Common attributes.
execution_id serial PRIMARY KEY,
experiments_version varchar(127),
......@@ -49,15 +48,14 @@ CREATE TABLE naive_self_join (
result_set_size bigint,
sum_subproblem_optimum bigint,
optimum_time bigint,
inv_list_lookups bigint,
-- Algorithm-specific attributes.
verification_algorithm varchar(31),
verification_candidates bigint, -- All pairs of trees that the join looks at and verifies.
verification_time bigint -- Total time of the join.
);
DROP TABLE IF EXISTS allpairs_self_join;
CREATE TABLE allpairs_self_join (
DROP TABLE IF EXISTS t_join;
CREATE TABLE t_join (
-- Common attributes.
execution_id serial PRIMARY KEY,
experiments_version varchar(127),
......@@ -71,23 +69,22 @@ CREATE TABLE allpairs_self_join (
result_set_size bigint,
sum_subproblem_optimum bigint,
optimum_time bigint,
inv_list_lookups bigint,
-- Algorithm-specific attributes.
similarity_function varchar(127),
verification_algorithm varchar(31),
tree_to_set_time bigint,
filter_touched_pairs bigint, -- Pairs of trees that the filter looks at.
filter_verification_candidates bigint, -- Pairs of trees resulting from filter only.
filter_time bigint, -- Total time of the filter and its verification step.
verification_candidates bigint, -- Pairs of trees resulting after verification step of the filter.
inv_list_lookups bigint, -- Pairs of trees that the index looks at.
index_verification_candidates bigint, -- Pairs of trees resulting from index only.
index_time bigint, -- Total time of the index and its verification step.
verification_candidates bigint, -- Pairs of trees resulting after verification step of the index.
verification_time bigint, -- TED verification time.
upperbound varchar(127), -- string that defines which upperbound is used
upperbound_time bigint, -- Time spent to compute upperbounds for verification candidates.
upperbound_pruned bigint -- Number of pairs in the result set by upperbound computation.
);
DROP TABLE IF EXISTS allpairs_multiset_baseline_self_join;
CREATE TABLE allpairs_multiset_baseline_self_join (
DROP TABLE IF EXISTS tang_join;
CREATE TABLE tang_join (
-- Common attributes.
execution_id serial PRIMARY KEY,
experiments_version varchar(127),
......@@ -105,103 +102,124 @@ CREATE TABLE allpairs_multiset_baseline_self_join (
-- Algorithm-specific attributes.
similarity_function varchar(127),
verification_algorithm varchar(31),
tree_to_set_time bigint,
filter_touched_pairs bigint, -- Pairs of trees that the filter looks at.
filter_verification_candidates bigint, -- Pairs of trees resulting from filter only.
filter_time bigint, -- Total time of the filter and its verification step.
verification_candidates bigint, -- Pairs of trees resulting after verification step of the filter.
tree_to_binary_tree_time bigint,
inv_list_lookups bigint, -- Pairs of trees that the index looks at.
index_verification_candidates bigint, -- Pairs of trees resulting from index only.
index_time bigint, -- Total time of the index and its verification step.
verification_candidates bigint, -- Pairs of trees resulting after verification step of the index.
verification_time bigint, -- TED verification time.
upperbound varchar(127), -- string that defines which upperbound is used
upperbound_time bigint, -- Time spent to compute upperbounds for verification candidates.
upperbound_pruned bigint -- Number of pairs in the result set by upperbound computation.
);
DROP TABLE IF EXISTS allpairs_multiset_dsf_self_join;
CREATE TABLE allpairs_multiset_dsf_self_join (
-- Common attributes.
execution_id serial PRIMARY KEY,
experiments_version varchar(127),
experiments_timestamp timestamp,
-- Parameters of a ted experiment (for normalization):
-- ted_experiment_timestamp timestamp,
-- experiments_source_commit varchar(127),
-- algorithm_source_commit varchar(127),
-- hostname varchar(127),
-- dataset_filename varchar(127) REFERENCES dataset(filename)
-- Common algorithm attributes:
--
-- IMPORTANT: Apply to all if modified.
-- execution_id bigserial PRIMARY KEY,
-- ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
-- tree_id_1 integer,
-- tree_id_2 integer,
-- tree_size_1 integer,
-- tree_size_2 integer,
-- ted_value decimal,
-- subproblems bigint,
-- runtime double precision
-- Only TEDk algs:
-- has_ted_mapping boolean, -- if TRUE ted_value >=0; if FALSE ted_value = NULL
-- ted_threshold decimal,
-- There is a tuple in this table for each execution of the experiments binary.
DROP TABLE IF EXISTS ted_experiment_params CASCADE;
CREATE TABLE ted_experiment_params (
ted_experiment_params_id serial PRIMARY KEY,
ted_experiment_timestamp timestamp,
experiments_source_commit varchar(127),
algorithm_source_commit varchar(127),
hostname varchar(127),
dataset_filename varchar(127) REFERENCES dataset(filename),
dataset_parsing_time bigint,
algorithm_version varchar(127),
threshold decimal,
sum_subproblems bigint,
result_set_size bigint,
sum_subproblem_optimum bigint,
optimum_time bigint,
inv_list_lookups bigint,
-- Algorithm-specific attributes.
similarity_function varchar(127),
verification_algorithm varchar(31),
tree_to_set_time bigint,
filter_touched_pairs bigint, -- Pairs of trees that the filter looks at.
filter_verification_candidates bigint, -- Pairs of trees resulting from filter only.
filter_time bigint, -- Total time of the filter and its verification step.
verification_candidates bigint, -- Pairs of trees resulting after verification step of the filter.
verification_time bigint, -- TED verification time.
upperbound varchar(127), -- string that defines which upperbound is used
upperbound_time bigint, -- Time spent to compute upperbounds for verification candidates.
upperbound_pruned bigint -- Number of pairs in the result set by upperbound computation.
dataset_filename varchar(127) REFERENCES dataset(filename)
);
DROP TABLE IF EXISTS allpairs_multiset_two_layer_self_join;
CREATE TABLE allpairs_multiset_two_layer_self_join (
-- Common attributes.
execution_id serial PRIMARY KEY,
experiments_version varchar(127),
experiments_timestamp timestamp,
hostname varchar(127),
dataset_filename varchar(127) REFERENCES dataset(filename),
dataset_parsing_time bigint,
algorithm_version varchar(127),
threshold decimal,
sum_subproblems bigint,
result_set_size bigint,
sum_subproblem_optimum bigint,
optimum_time bigint,
inv_list_lookups bigint,
-- Algorithm-specific attributes.
similarity_function varchar(127),
verification_algorithm varchar(31),
tree_to_set_time bigint,
filter_touched_pairs bigint, -- Pairs of trees that the filter looks at.
filter_verification_candidates bigint, -- Pairs of trees resulting from filter only.
filter_time bigint, -- Total time of the filter and its verification step.
verification_candidates bigint, -- Pairs of trees resulting after verification step of the filter.
verification_time bigint, -- TED verification time.
upperbound varchar(127), -- string that defines which upperbound is used
upperbound_time bigint, -- Time spent to compute upperbounds for verification candidates.
upperbound_pruned bigint -- Number of pairs in the result set by upperbound computation.
DROP TABLE IF EXISTS ted_zhangshasha;
CREATE TABLE ted_zhangshasha (
execution_id bigserial PRIMARY KEY,
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
runtime double precision
);
DROP TABLE IF EXISTS partition_based_self_join;
CREATE TABLE partition_based_self_join (
-- Common attributes.
execution_id serial PRIMARY KEY,
experiments_version varchar(127),
experiments_timestamp timestamp,
hostname varchar(127),
dataset_filename varchar(127) REFERENCES dataset(filename),
dataset_parsing_time bigint,
algorithm_version varchar(127),
threshold decimal,
sum_subproblems bigint,
result_set_size bigint,
sum_subproblem_optimum bigint,
optimum_time bigint,
inv_list_lookups bigint,
-- Algorithm-specific attributes.
similarity_function varchar(127),
verification_algorithm varchar(31),
tree_to_binary_tree_time bigint,
filter_touched_pairs bigint, -- Pairs of trees that the filter looks at.
filter_verification_candidates bigint, -- Pairs of trees resulting from filter only.
filter_time bigint, -- Total time of the filter and its verification step.
verification_candidates bigint, -- Pairs of trees resulting after verification step of the filter.
verification_time bigint, -- TED verification time.
upperbound varchar(127), -- string that defines which upperbound is used
upperbound_time bigint, -- Time spent to compute upperbounds for verification candidates.
upperbound_pruned bigint -- Number of pairs in the result set by upperbound computation.
DROP TABLE IF EXISTS ted_apted;
CREATE TABLE ted_apted (
execution_id bigserial PRIMARY KEY,
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
runtime double precision
);
-- Touzet's algorithm - baseline version without the depth-based pruning
DROP TABLE IF EXISTS tedk_touzet;
CREATE TABLE tedk_touzet (
execution_id bigserial PRIMARY KEY,
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
runtime double precision,
has_ted_mapping boolean, -- if TRUE ted_value >=0; if FALSE ted_value = NULL
ted_threshold decimal
);
-- Touzet's algorithm - with depth-based pruning
DROP TABLE IF EXISTS tedk_touzetd;
CREATE TABLE tedk_touzetd (
execution_id bigserial PRIMARY KEY,
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
runtime double precision,
has_ted_mapping boolean, -- if TRUE ted_value >=0; if FALSE ted_value = NULL
ted_threshold decimal
);
-- Label-guided upper bound
DROP TABLE IF EXISTS tedk_labelguided;
CREATE TABLE tedk_labelguided (
execution_id bigserial PRIMARY KEY,
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
runtime double precision,
ted_threshold decimal
);
\ No newline at end of file
-- Parameters of a ted experiment (for normalization):
-- ted_experiment_timestamp timestamp,
-- experiments_source_commit varchar(127),
-- algorithm_source_commit varchar(127),
-- hostname varchar(127),
-- dataset_filename varchar(127) REFERENCES dataset(filename)
-- Common algorithm attributes:
--
-- IMPORTANT: Apply to all if modified.
-- execution_id bigserial PRIMARY KEY,
-- ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
-- tree_id_1 integer,
-- tree_id_2 integer,
-- tree_size_1 integer,
-- tree_size_2 integer,
-- ted_value decimal,
-- subproblems bigint,
-- runtime double precision
-- Only TEDk algs:
-- has_ted_mapping boolean, -- if TRUE ted_value >=0; if FALSE ted_value = NULL
-- ted_threshold decimal,
-- Parameters of one TED experiment run.
-- There is a tuple in this table for each execution of the experiments binary.
-- The table is dropped and recreated, so existing rows are discarded.
-- NOTE(review): CASCADE presumably also drops the foreign-key constraints of
-- the ted_* / tedk_* result tables that reference this table (PostgreSQL
-- semantics assumed from the serial/bigserial types) -- confirm.
DROP TABLE IF EXISTS ted_experiment_params CASCADE;
CREATE TABLE ted_experiment_params (
-- Surrogate key; the ted_* / tedk_* result tables reference it.
ted_experiment_params_id serial PRIMARY KEY,
ted_experiment_timestamp timestamp,
-- NOTE(review): presumably version-control commit ids of the experiments code
-- and of the algorithm library -- confirm.
experiments_source_commit varchar(127),
algorithm_source_commit varchar(127),
hostname varchar(127),
-- Foreign key into dataset(filename).
dataset_filename varchar(127) REFERENCES dataset(filename)
);
-- Result table holding the common algorithm attributes only (no threshold
-- columns). Dropped and recreated, so existing rows are discarded.
-- NOTE(review): the table name suggests results of the Zhang & Shasha TED
-- algorithm, one row per compared tree pair -- confirm.
DROP TABLE IF EXISTS ted_zhangshasha;
CREATE TABLE ted_zhangshasha (
execution_id bigserial PRIMARY KEY,
-- Mandatory link to the experiment run that produced this row.
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
-- NOTE(review): time unit is not stated in this schema -- confirm.
runtime double precision
);
-- Result table holding the common algorithm attributes only (no threshold
-- columns). Dropped and recreated, so existing rows are discarded.
-- NOTE(review): the table name suggests results of the APTED algorithm, one
-- row per compared tree pair -- confirm.
DROP TABLE IF EXISTS ted_apted;
CREATE TABLE ted_apted (
execution_id bigserial PRIMARY KEY,
-- Mandatory link to the experiment run that produced this row.
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
-- NOTE(review): time unit is not stated in this schema -- confirm.
runtime double precision
);
-- Touzet's algorithm - baseline version without the depth-based pruning
-- Holds the common algorithm attributes plus the two threshold-related
-- columns (has_ted_mapping, ted_threshold). Dropped and recreated, so
-- existing rows are discarded.
DROP TABLE IF EXISTS tedk_touzet;
CREATE TABLE tedk_touzet (
execution_id bigserial PRIMARY KEY,
-- Mandatory link to the experiment run that produced this row.
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
-- NOTE(review): time unit is not stated in this schema -- confirm.
runtime double precision,
has_ted_mapping boolean, -- if TRUE ted_value >=0; if FALSE ted_value = NULL
ted_threshold decimal
);
-- Touzet's algorithm - with depth-based pruning
-- Same column layout as tedk_touzet: the common algorithm attributes plus
-- has_ted_mapping and ted_threshold. Dropped and recreated, so existing rows
-- are discarded.
DROP TABLE IF EXISTS tedk_touzetd;
CREATE TABLE tedk_touzetd (
execution_id bigserial PRIMARY KEY,
-- Mandatory link to the experiment run that produced this row.
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
-- NOTE(review): time unit is not stated in this schema -- confirm.
runtime double precision,
has_ted_mapping boolean, -- if TRUE ted_value >=0; if FALSE ted_value = NULL
ted_threshold decimal
);
-- Label-guided upper bound
-- Holds the common algorithm attributes plus ted_threshold. Unlike the
-- tedk_touzet / tedk_touzetd tables, it has no has_ted_mapping column.
-- Dropped and recreated, so existing rows are discarded.
DROP TABLE IF EXISTS tedk_labelguided;
CREATE TABLE tedk_labelguided (
execution_id bigserial PRIMARY KEY,
-- Mandatory link to the experiment run that produced this row.
ted_experiment_params_id integer NOT NULL REFERENCES ted_experiment_params(ted_experiment_params_id),
tree_id_1 integer,
tree_id_2 integer,
tree_size_1 integer,
tree_size_2 integer,
ted_value decimal,
subproblems bigint,
-- NOTE(review): time unit is not stated in this schema -- confirm.
runtime double precision,
ted_threshold decimal
);
\ No newline at end of file
This diff is collapsed.
// The MIT License (MIT)
// Copyright (c) 2017 Thomas Huetter
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
/// \file src/join_algs/join_algs_experiments.cc
///
/// \details
/// Implements an experimental environment that executes algorithms located in
/// the folder external/. Input file, distance threshold and algorithm are
/// passed as commandline arguments.
#include "join_algs_experiments.h"
template <typename Label, typename CostModel, typename VerificationAlgorithm>
void execute_naive_join(std::vector<node::Node<Label>>& trees_collection, double distance_threshold) {
// Initialize join algorithm
join::NaiveJoin<Label, CostModel, VerificationAlgorithm> nsj;
// Initialized Timing object
Timing timing;
Timing::Interval * naive_join = timing.create_enroll("NaiveJoin");
// Start timing
naive_join->start();
// Verify all computed join candidates and return the join result
std::vector<join::JoinResultElement> join_result_nsj =
nsj.execute_join(trees_collection, distance_threshold);
// Stop timing
naive_join->stop();
// Calculate optimum by verify only the resultset
// Initialized Timing object
Timing::Interval * optimum = timing.create_enroll("Optimum");
// Start timing
optimum->start();
VerificationAlgorithm ted_algorithm;
std::vector<join::JoinResultElement> optimum_result;
unsigned long long int sum_subproblem_optimum = 0;
for(auto pair: join_result_nsj) {
double ted_value = ted_algorithm.verify(trees_collection[pair.tree_id_1],
trees_collection[pair.tree_id_2],
distance_threshold);
if(ted_value <= distance_threshold)
optimum_result.emplace_back(pair.tree_id_1, pair.tree_id_2, ted_value);
// Sum up all number of subproblems
sum_subproblem_optimum += ted_algorithm.get_subproblem_count();
}
// Stop timing
optimum->stop();
// Write timing
std::cout << "\"sum_subproblem_optimum\": " << sum_subproblem_optimum << ", ";
std::cout << "\"optimum_time\": " << optimum->getfloat() << ", ";
// Write timing and number of result pairs
int num_trees = trees_collection.size();
std::cout << "\"verification_candidates\" : " << (num_trees*num_trees-num_trees)/2 << ", ";