Commit de6ce32c authored by Thomas Huetter's avatar Thomas Huetter

added combined histogram algorithm

parent d581c4b4
......@@ -65,6 +65,7 @@ target_include_directories(
external/tree-similarity/src/join/label_histogram
external/tree-similarity/src/join/degree_histogram
external/tree-similarity/src/join/leaf_dist_histogram
external/tree-similarity/src/join/histogram
external/tree-similarity/src/label
external/tree-similarity/src/node
external/tree-similarity/src/parser
......
{
"datasets": [
"bolzano/bolzano_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "histogram_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "tang_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "binary_branches_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
}
]
}
\ No newline at end of file
{
"datasets": [
"dblp/dblp_10000_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "histogram_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "tang_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "binary_branches_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
}
]
}
\ No newline at end of file
{
"datasets": [
"python/python_10000_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 5.0, 10.0, 15.0, 20.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "histogram_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "tang_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "binary_branches_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
}
]
}
\ No newline at end of file
{
"datasets": [
"sentiment/sentiment_sorted.bracket"
],
"thresholds": [
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
11.0, 12.0, 13.0, 14.0, 15.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "histogram_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "tang_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "binary_branches_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
}
]
}
\ No newline at end of file
{
"datasets": [
"swissprot/swissprot_10000_sorted.bracket"
],
"thresholds": [
1.0, 5.0, 10.0, 15.0, 20.0, 25.0, 30.0,
35.0, 40.0, 45.0, 50.0, 55.0, 60.0
],
"algorithms": [
{ "name": "t_join",
"verification_algorithm" : "Touzet",
"upperbound": "greedy"
},
{ "name": "histogram_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "tang_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
},
{ "name": "binary_branches_join",
"verification_algorithm" : "APTED",
"upperbound": "none"
}
]
}
\ No newline at end of file
......@@ -222,6 +222,34 @@ CREATE TABLE degree_histogram_join (
upperbound_pruned bigint -- Number of pairs in the result set by upperbound computation.
);
-- Experiment results for the histogram_join algorithm.
-- The table is recreated from scratch: an existing histogram_join table
-- (including its rows) is dropped first.
-- NOTE(review): the unit of the *_time columns is not stated here; the plot
-- configs divide by 1000 and label the axis "Time [s]", which suggests
-- milliseconds -- confirm.
DROP TABLE IF EXISTS histogram_join;
CREATE TABLE histogram_join (
-- Common attributes.
execution_id serial PRIMARY KEY,
experiments_version varchar(127),
experiments_timestamp timestamp,
hostname varchar(127),
dataset_filename varchar(127) REFERENCES dataset(filename), -- Must match a filename in the dataset table.
dataset_parsing_time bigint,
algorithm_version varchar(127),
threshold decimal, -- Distance threshold of the join (presumably the value passed to the experiment binary; confirm).
sum_subproblems bigint,
join_result_size bigint, -- Number of pairs in the join result.
sum_subproblem_optimum bigint, -- Subproblems counted when verifying only the pairs of the join result.
optimum_time bigint, -- Time to verify only the pairs of the join result.
-- Algorithm-specific attributes.
verification_algorithm varchar(31),
tree_to_set_time bigint, -- Time to convert the trees into their histograms.
inv_list_lookups bigint, -- Pairs of trees that the index looks at.
index_verification_candidates bigint, -- Pairs of trees resulting from index only.
index_time bigint, -- Total time of the index and its verification step.
verification_candidates bigint, -- Pairs of trees resulting after verification step of the index.
verification_time bigint, -- TED verification time.
upperbound varchar(127), -- string that defines which upperbound is used
upperbound_time bigint, -- Time spent to compute upperbounds for verification candidates.
upperbound_pruned bigint -- Number of pairs in the result set by upperbound computation.
);
-- Parameters of a ted experiment (for normalization):
-- ted_experiment_timestamp timestamp,
......
......@@ -710,6 +710,137 @@ void execute_lh_join(std::vector<node::Node<Label>>& trees_collection, std::stri
std::cout << "\"optimum_time\": " << optimum->getfloat() << "}" << std::endl;
}
/// Runs the combined histogram join and prints its statistics to stdout.
///
/// The statistics are written as `"key": value, ` fragments and closed with
/// `}` and a line break; the opening `{` is not printed here (presumably the
/// caller emits it -- confirm). The phases are, in order:
///   1. convert all trees into label, degree and leaf-distance histograms,
///   2. retrieve the join candidates from these histograms,
///   3. optionally accept candidates early with the greedy upper bound,
///   4. verify the remaining candidates,
///   5. re-verify only the pairs of the join result ("optimum").
///
/// @param trees_collection   Trees to join with each other.
/// @param upperbound         "greedy" enables the greedy upper bound; any
///                           other value skips it and reports zeros.
/// @param distance_threshold Threshold passed to candidate retrieval, upper
///                           bound and verification.
template <typename Label, typename CostModel, typename VerificationAlgorithm>
void execute_histogram_join(std::vector<node::Node<Label>>& trees_collection, std::string upperbound,
    double distance_threshold) {
  using HistogramCollection =
      std::vector<std::pair<unsigned int, std::unordered_map<unsigned int, unsigned int>>>;
  using CandidateList = std::vector<std::pair<unsigned int, unsigned int>>;

  // Initialize join algorithm
  join::HJoin<Label, CostModel, VerificationAlgorithm> hj;

  Timing timing;
  std::vector<join::JoinResultElement> join_result;

  // The nested scopes release the histograms and the candidate list before
  // the optimum computation starts.
  {
    // Phase 1: convert the trees into their three histograms.
    Timing::Interval * tree_to_set = timing.create_enroll("TreeToSet");
    tree_to_set->start();

    HistogramCollection label_histogram_collection;
    HistogramCollection degree_histogram_collection;
    HistogramCollection leaf_distance_histogram_collection;
    hj.convert_trees_to_sets(trees_collection, label_histogram_collection,
        degree_histogram_collection, leaf_distance_histogram_collection);

    tree_to_set->stop();
    std::cout << "\"tree_to_set_time\": " << tree_to_set->getfloat() << ", ";

    {
      // Phase 2: retrieve the join candidates from the histograms.
      Timing::Interval * retCand = timing.create_enroll("RetrieveCandidates");
      retCand->start();

      CandidateList join_candidates;
      hj.retrieve_candidates(label_histogram_collection, degree_histogram_collection,
          leaf_distance_histogram_collection, join_candidates, distance_threshold);

      retCand->stop();
      std::cout << "\"index_time\": " << retCand->getfloat() << ", ";
      std::cout << "\"verification_candidates\": " << join_candidates.size() << ", ";

      // Phase 3: optional greedy upper bound.
      if(upperbound == "greedy") {
        ted_ub::GreedyUB<Label, CostModel> gub;

        Timing::Interval * greedyub = timing.create_enroll("GreedyUB");
        greedyub->start();

        // Index-based swap-and-pop: pop_back() invalidates iterators to the
        // last element and end(), so an iterator must not be kept across it.
        CandidateList::size_type pos = 0;
        while(pos < join_candidates.size()) {
          const std::pair<unsigned int, unsigned int> candidate = join_candidates[pos];
          double ub_value = gub.verify(trees_collection[candidate.first],
                                       trees_collection[candidate.second],
                                       distance_threshold);
          if(ub_value <= distance_threshold) {
            // Accepted by the upper bound: move to the result and overwrite
            // the slot with the last candidate, which is examined next.
            join_result.emplace_back(candidate.first, candidate.second, ub_value);
            join_candidates[pos] = join_candidates.back();
            join_candidates.pop_back();
          }
          else {
            ++pos;
          }
        }

        greedyub->stop();
        std::cout << "\"upperbound_time\": " << greedyub->getfloat() << ", ";
        std::cout << "\"upperbound_pruned\": " << join_result.size() << ", ";
      } else {
        std::cout << "\"upperbound_time\": 0" << ", ";
        std::cout << "\"upperbound_pruned\": 0" << ", ";
      }

      // Phase 4: verify all remaining join candidates; matches are appended
      // to join_result.
      Timing::Interval * verify = timing.create_enroll("Verify");
      verify->start();

      hj.verify_candidates(trees_collection, join_candidates,
                           join_result, distance_threshold);

      verify->stop();
      std::cout << "\"verification_time\": " << verify->getfloat() << ", ";
    }

    // Write number of candidates and number of result pairs
    std::cout << "\"index_verification_candidates\": " << hj.get_number_of_pre_candidates() << ", ";
    std::cout << "\"inv_list_lookups\": " << hj.get_number_of_il_lookups() << ", ";
    std::cout << "\"sum_subproblems\": " << hj.get_subproblem_count() << ", ";
    std::cout << "\"join_result_size\": " << join_result.size() << ", ";
  }

  // Phase 5: calculate the optimum by verifying only the result set.
  Timing::Interval * optimum = timing.create_enroll("Optimum");
  optimum->start();

  VerificationAlgorithm ted_algorithm;
  std::vector<join::JoinResultElement> optimum_result;
  unsigned long long int sum_subproblem_optimum = 0;

  // Iterate by reference; the result elements do not need to be copied.
  for(const auto& result_pair : join_result) {
    double ted_value = ted_algorithm.verify(trees_collection[result_pair.tree_id_1],
                                            trees_collection[result_pair.tree_id_2],
                                            distance_threshold);
    if(ted_value <= distance_threshold)
      optimum_result.emplace_back(result_pair.tree_id_1, result_pair.tree_id_2, ted_value);
    // Sum up all number of subproblems
    sum_subproblem_optimum += ted_algorithm.get_subproblem_count();
  }

  optimum->stop();
  // The final flush via std::endl is kept so a reading process sees the
  // completed line immediately.
  std::cout << "\"sum_subproblem_optimum\": " << sum_subproblem_optimum << ", ";
  std::cout << "\"optimum_time\": " << optimum->getfloat() << "}" << std::endl;
}
template <typename Label, typename CostModel, typename VerificationAlgorithm>
void execute_tang_join(std::vector<node::Node<Label>>& trees_collection,
std::string upperbound, double distance_threshold) {
......@@ -940,6 +1071,14 @@ int main(int argc, char** argv) {
} else if (argv[4] == std::string("APTED")) {
execute_bb_join<Label, CostModel, APTED>(trees_collection, upperbound, distance_threshold);
}
} else if(argv[3] == std::string("histogram_join")) {
if (argv[4] == std::string("ZhangShasha")) {
execute_histogram_join<Label, CostModel, ZhangShasha>(trees_collection, upperbound, distance_threshold);
} else if (argv[4] == std::string("Touzet")) {
execute_histogram_join<Label, CostModel, Touzet>(trees_collection, upperbound, distance_threshold);
} else if (argv[4] == std::string("APTED")) {
execute_histogram_join<Label, CostModel, APTED>(trees_collection, upperbound, distance_threshold);
}
}
return 0;
......
......@@ -44,6 +44,7 @@
#include "dh_join.h"
#include "ldh_join.h"
#include "bb_join.h"
#include "histo_join.h"
#include "unit_cost_model.h"
#include "label_set_converter.h"
#include "binary_tree_converter.h"
......
......@@ -128,7 +128,7 @@ def main():
# build command that needs to be executed
cmd = []
# call binary
if a['name'] == 't_join' or a['name'] == 'tang_join' or a['name'] == 'leaf_distance_histogram_join' or a['name'] == 'label_histogram_join' or a['name'] == 'degree_histogram_join' or a['name'] == 'binary_branches_join':
if a['name'] == 't_join' or a['name'] == 'tang_join' or a['name'] == 'leaf_distance_histogram_join' or a['name'] == 'label_histogram_join' or a['name'] == 'degree_histogram_join' or a['name'] == 'binary_branches_join' or a['name'] == 'histogram_join':
algorithm_params = {
"verification_algorithm" : a['verification_algorithm'],
"upperbound" : a['upperbound']
......
### FPR
python3 plot_experiments.py --service ted-exp --config configs_revision/fpr/fpr_bolzano.json --storeplot "./plots/fpr/bolzano_fpr.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/fpr/fpr_dblp_10000.json --storeplot "./plots/fpr/dblp_fpr_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/fpr/fpr_python_10000.json --storeplot "./plots/fpr/python_fpr_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/fpr/fpr_sentiment.json --storeplot "./plots/fpr/sentiment_fpr.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/fpr/fpr_swissprot_10000.json --storeplot "./plots/fpr/swissprot_fpr_10000.pdf"
### Join Times
python3 plot_experiments.py --service ted-exp --config configs_revision/times/times_bolzano.json --storeplot "./plots/times/bolzano_times.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/times/times_dblp_10000.json --storeplot "./plots/times/dblp_times_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/times/times_python_10000.json --storeplot "./plots/times/python_times_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/times/times_sentiment.json --storeplot "./plots/times/sentiment_times.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/times/times_swissprot_10000.json --storeplot "./plots/times/swissprot_times_10000.pdf"
### Candidate Times
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times/cand_times_bolzano.json --storeplot "./plots/cand_times/bolzano_cand_times.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times/cand_times_dblp_10000.json --storeplot "./plots/cand_times/dblp_cand_times_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times/cand_times_python_10000.json --storeplot "./plots/cand_times/python_cand_times_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times/cand_times_sentiment.json --storeplot "./plots/cand_times/sentiment_cand_times.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times/cand_times_swissprot_10000.json --storeplot "./plots/cand_times/swissprot_cand_times_10000.pdf"
### Candidate Times (incl. Conversion)
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times_conv/cand_times_conv_bolzano.json --storeplot "./plots/cand_times_conv/bolzano_cand_conv_times.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times_conv/cand_times_conv_dblp_10000.json --storeplot "./plots/cand_times_conv/dblp_cand_conv_times_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times_conv/cand_times_conv_python_10000.json --storeplot "./plots/cand_times_conv/python_cand_conv_times_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times_conv/cand_times_conv_sentiment.json --storeplot "./plots/cand_times_conv/sentiment_cand_conv_times.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/cand_times_conv/cand_times_conv_swissprot_10000.json --storeplot "./plots/cand_times_conv/swissprot_cand_conv_times_10000.pdf"
### Candidates
python3 plot_experiments.py --service ted-exp --config configs_revision/candidates/candidates_bolzano.json --storeplot "./plots/candidates/bolzano_candidates.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/candidates/candidates_dblp_10000.json --storeplot "./plots/candidates/dblp_candidates_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/candidates/candidates_python_10000.json --storeplot "./plots/candidates/python_candidates_10000.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/candidates/candidates_sentiment.json --storeplot "./plots/candidates/sentiment_candidates.pdf"
python3 plot_experiments.py --service ted-exp --config configs_revision/candidates/candidates_swissprot_10000.json --storeplot "./plots/candidates/swissprot_candidates_10000.pdf"
\ No newline at end of file
{
"title": "Times",
"print_title": "no",
"legend_font_size": 18,
"legend_frame_alpha": 0.8,
"legend": "upper left",
"grid": "on",
"dataset_name": "Bolzano",
"markers": ["s", "*", "o", "^", "p", "d"],
"markersize": [15, 20, 15, 15, 15, 15],
"markerfacecolor": "none",
"colors": ["red", "green", "blue", "cyan", "magenta", "orange"],
"tables": [
{
"table_name": "tang_join",
"attributes": [
{
"attr_name": "avg(index_time)/1000"
}
],
"name": "$\\mathtt{Tang}$",
"constraints": {
"verification_algorithm": "APTED"
}
},
{
"table_name": "t_join",
"attributes": [
{
"attr_name": "avg(tree_to_set_time + index_time)/1000"
}
],
"name": "$\\mathtt{TJoin}$",
"constraints": {
"verification_algorithm": "Touzet",
"upperbound": "greedy"
}
},
{
"table_name": "binary_branches_join",
"attributes": [
{
"attr_name": "avg(index_time)/1000"
}
],
"name": "$\\mathtt{BinBranches}$",
"constraints": {
"verification_algorithm": "APTED"
}
},
{
"table_name": "histogram_join",
"attributes": [
{
"attr_name": "avg(index_time)/1000"
}
],
"name": "$\\mathtt{Histogram}$",
"constraints": {
"verification_algorithm": "APTED"
}
}
],
"constraints": {
"dataset_filename": "bolzano_sorted.bracket"
},
"x_axis": {
"db_column": "threshold",
"name": "Threshold",
"xticks": 2,
"xmin": 1,
"xmax": 15.001,
"font_size": 20,
"ticks_font_size": 20
},
"y_axis": {
"name": "Time [s]",
"scale": "log",
"font_size": 20,
"ticks_font_size": 20
}
}
\ No newline at end of file
{
"title": "Times",
"print_title": "no",
"legend_font_size": 18,
"legend_frame_alpha": 0.8,
"legend": "upper left",
"grid": "on",
"dataset_name": "DBLP 10000",
"markers": ["s", "*", "o", "^", "p", "d"],
"markersize": [15, 20, 15, 15, 15, 15],
"markerfacecolor": "none",
"colors": ["red", "green", "blue", "cyan", "magenta", "orange"],
"tables": [
{
"table_name": "tang_join",
"attributes": [
{
"attr_name": "avg(index_time)/1000"
}
],
"name": "$\\mathtt{Tang}$",
"constraints": {
"verification_algorithm": "APTED"
}
},
{
"table_name": "t_join",
"attributes": [
{
"attr_name": "avg(tree_to_set_time + index_time)/1000"
}
],
"name": "$\\mathtt{TJoin}$",
"constraints": {
"verification_algorithm": "Touzet",
"upperbound": "greedy"
}
},
{
"table_name": "binary_branches_join",
"attributes": [
{
"attr_name": "avg(index_time)/1000"
}
],
"name": "$\\mathtt{BinBranches}$",
"constraints": {
"verification_algorithm": "APTED"
}
},
{
"table_name": "histogram_join",
"attributes": [
{
"attr_name": "avg(index_time)/1000"
}
],
"name": "$\\mathtt{Histogram}$",
"constraints": {
"verification_algorithm": "APTED"
}
}
],
"constraints": {
"dataset_filename": "dblp_10000_sorted.bracket"
},
"x_axis": {
"db_column": "threshold",
"name": "Threshold",
"xticks": 1,
"xmin": 1,
"xmax": 8.001,
"font_size": 20,
"ticks_font_size": 20
},
"y_axis": {
"name": "Time [s]",
"scale": "log",
"font_size": 20,
"ticks_font_size": 20
}
}
\ No newline at end of file
{
"title": "Times",
"print_title": "no",
"legend_font_size": 18,
"legend_frame_alpha": 0.8,
"legend": "lower right",
"grid": "on",
"dataset_name": "Python AST 10000",
"markers": ["s", "*", "o", "^", "p", "d"],
"markersize": [15, 20, 15, 15, 15, 15],
"markerfacecolor": "none",
"colors": ["red", "green", "blue", "cyan", "magenta", "orange"],
"tables": [
{
"table_name": "tang_join",
"attributes": [
{
"attr_name": "avg(index_time)/1000"
}
],
"name": "$\\mathtt{Tang}$",
"constraints": {
"verification_algorithm": "APTED"
}
},
{
"table_name": "t_join",
"attributes": [
{
"attr_name": "avg(tree_to_set_time + index_time)/1000"
}
],
"name": "$\\mathtt{TJoin}$",
"constraints": {
"verification_algorithm": "Touzet",
"upperbound": "greedy"
}
},
{
"table_name": "binary_branches_join",
"attributes": [
{
"attr_name": "avg(index_time)/1000"
}
],
"name": "$\\mathtt{BinBranches}$",
"constraints": {
"verification_algorithm": "APTED"
}
},
{
"table_name": "histogram_join",
"attributes": [
{
"attr_name": "avg(index_time)/1000"
}
],
"name": "$\\mathtt{Histogram}$",
"constraints": {
"verification_algorithm": "APTED"
}
}
],
"constraints": {
"dataset_filename": "python_10000_sorted.bracket"
},
"x_axis": {
"db_column": "threshold",
"name": "Threshold",
"xticks": 2,
"xmin": 1,
"xmax": 20.001,
"font_size": 20,
"ticks_font_size": 20
},
"y_axis": {
"name": "Time [s]",