Commit 8ef9c151 authored by Mateusz Pawlik
Browse files

Added Guha to experiments.

parent 78023e35
......@@ -71,6 +71,8 @@ target_include_directories(
external/tree-similarity/src/parser
external/tree-similarity/src/ted
external/tree-similarity/src/ted_ub
external/tree-similarity/src/ted_lb
external/tree-similarity/src/join/guha
)
# Let the compiler know to find the header files in TreeSimilarity library.
......
......@@ -250,6 +250,31 @@ CREATE TABLE histogram_join (
upperbound_pruned bigint -- Number of pairs in the result set by upperbound computation.
);
-- Results of guha_join experiment runs; each row carries its own serial
-- execution_id. The algorithm-specific columns carry the same names as the
-- JSON keys printed by execute_guha_join in the experiments binary.
DROP TABLE IF EXISTS guha_join;
CREATE TABLE guha_join (
-- Common attributes.
execution_id serial PRIMARY KEY,
experiments_version varchar(127),
experiments_timestamp timestamp,
hostname varchar(127),
dataset_filename varchar(127) REFERENCES dataset(filename), -- Must match an existing row in dataset.
dataset_parsing_time bigint,
algorithm_version varchar(127),
threshold decimal, -- Distance threshold the join was executed with.
join_result_size bigint, -- Number of elements in the join result.
-- Algorithm-specific attributes.
verification_algorithm varchar(31), -- Verification algorithm name; the binary accepts ZhangShasha, Touzet and APTED.
vectors_time bigint, -- Time for picking the random reference set and computing the vectors.
candidates_time bigint, -- Time for retrieving the join candidates.
ted_verification_candidates bigint, -- Number of candidate pairs handed to TED verification.
verification_time bigint, -- TED verification time.
-- The four counters below are read from the algorithm object after candidate
-- retrieval (get_l_t_candidates, get_sed_candidates, get_u_t_result_pairs,
-- get_cted_result_pairs). NOTE(review): presumably pairs selected/accepted by
-- the respective lower/upper bound filters -- confirm against the algorithm.
l_t_candidates bigint,
sed_candidates bigint,
u_t_result_pairs bigint,
cted_result_pairs bigint,
reference_set_size int -- Requested size of the random reference set.
);
-- Parameters of a ted experiment (for normalization):
-- ted_experiment_timestamp timestamp,
......
......@@ -968,6 +968,81 @@ void execute_tang_join(std::vector<node::Node<Label>>& trees_collection,
std::cout << "\"optimum_time\": " << optimum->getfloat() << "}" << std::endl;
}
/// Runs the Guha join over a collection of trees and prints the measurements
/// to standard output as the tail of a JSON object.
///
/// The keys are emitted in this order: vectors_time, candidates_time,
/// ted_verification_candidates, l_t_candidates, sed_candidates,
/// u_t_result_pairs, cted_result_pairs, verification_time, join_result_size.
/// The closing brace of the JSON object and a newline are written at the end.
/// The opening brace is NOT written here (presumably the caller prints it
/// together with the common attributes -- confirm against main).
///
/// \tparam Label                 Label type of the tree nodes.
/// \tparam CostModel             Cost model forwarded to join::Guha.
/// \tparam VerificationAlgorithm Verification algorithm forwarded to join::Guha.
/// \param trees_collection    Trees to be joined.
/// \param distance_threshold  Distance threshold of the join.
/// \param reference_set_size  Requested number of trees in the random
///                            reference set.
template <typename Label, typename CostModel, typename VerificationAlgorithm>
void execute_guha_join(std::vector<node::Node<Label>>& trees_collection,
    double distance_threshold, unsigned int reference_set_size) {
  join::Guha<Label, CostModel, VerificationAlgorithm> guha_join;
  Timing timing;
  std::vector<join::JoinResultElement> join_result;

  // The nested scopes make sure that intermediate data structures are
  // deallocated as soon as they are no longer needed.
  {
    // Phase 1: pick the reference set and compute the vectors.
    Timing::Interval* vectors_interval = timing.create_enroll("Vectors");
    vectors_interval->start();
    std::vector<unsigned int> reference_set =
        guha_join.get_random_reference_set(trees_collection, reference_set_size);
    // One row per tree, one entry per reference tree.
    std::vector<std::vector<double>> ted_vectors(
        trees_collection.size(),
        std::vector<double>(reference_set.size()));
    guha_join.compute_vectors(trees_collection, reference_set, ted_vectors);
    vectors_interval->stop();
    std::cout << "\"vectors_time\": " << vectors_interval->getfloat() << ", ";

    {
      // Phase 2: retrieve the candidate pairs. join_result is passed as well,
      // so this step may already add pairs to the result.
      Timing::Interval* candidates_interval =
          timing.create_enroll("RetrieveCandidates");
      candidates_interval->start();
      std::vector<std::pair<unsigned int, unsigned int>> join_candidates;
      guha_join.retrieve_candidates(trees_collection, join_candidates,
          join_result, distance_threshold, reference_set, ted_vectors);
      candidates_interval->stop();
      std::cout << "\"candidates_time\": " << candidates_interval->getfloat() << ", ";
      // The candidate count is reported before verification is run on them.
      std::cout << "\"ted_verification_candidates\": " << join_candidates.size() << ", ";
      std::cout << "\"l_t_candidates\": " << guha_join.get_l_t_candidates() << ", ";
      std::cout << "\"sed_candidates\": " << guha_join.get_sed_candidates() << ", ";
      std::cout << "\"u_t_result_pairs\": " << guha_join.get_u_t_result_pairs() << ", ";
      std::cout << "\"cted_result_pairs\": " << guha_join.get_cted_result_pairs() << ", ";

      // Phase 3: verify the remaining candidates and complete the join result.
      Timing::Interval* verification_interval = timing.create_enroll("Verify");
      verification_interval->start();
      guha_join.verify_candidates(trees_collection, join_candidates,
          join_result, distance_threshold, ted_vectors);
      verification_interval->stop();
      std::cout << "\"verification_time\": " << verification_interval->getfloat() << ", ";
    }

    // Last key: close the JSON object and flush so that the calling process
    // receives the complete line.
    std::cout << "\"join_result_size\": " << join_result.size() << "}" << std::endl;
  }
  // No "Optimum" interval is enrolled here: the previous version started one
  // but never stopped or reported it, so it measured nothing.
}
int main(int argc, char** argv) {
using Label = label::StringLabel;
using CostModel = cost_model::UnitCostModel<Label>;
......@@ -1079,6 +1154,15 @@ int main(int argc, char** argv) {
} else if (argv[4] == std::string("APTED")) {
execute_histogram_join<Label, CostModel, APTED>(trees_collection, upperbound, distance_threshold);
}
} else if (argv[3] == std::string("guha_join")) {
unsigned int reference_set_size = std::stoi(argv[7]);
if (argv[4] == std::string("ZhangShasha")) {
execute_guha_join<Label, CostModel, ZhangShasha>(trees_collection, distance_threshold, reference_set_size);
} else if (argv[4] == std::string("Touzet")) {
execute_guha_join<Label, CostModel, Touzet>(trees_collection, distance_threshold, reference_set_size);
} else if (argv[4] == std::string("APTED")) {
execute_guha_join<Label, CostModel, APTED>(trees_collection, distance_threshold, reference_set_size);
}
}
return 0;
......
......@@ -52,5 +52,6 @@
#include "touzet.h"
#include "greedy_ub.h"
#include "apted.h"
#include "guha.h"
#endif // JOIN_ALGS_EXPERIMENTS_H
......@@ -139,6 +139,12 @@ def main():
"verification_algorithm" : a['verification_algorithm']
}
cmd.extend((binary_name, d, str(t), a['name'], a['verification_algorithm'], "", a['upperbound']))
elif a['name'] == 'guha_join':
algorithm_params = {
"verification_algorithm" : a['verification_algorithm'],
"reference_set_size" : a['reference_set_size']
}
cmd.extend((binary_name, d, str(t), a['name'], a['verification_algorithm'], "", "", a['reference_set_size']))
cmd_output = get_stdout_cmd(cmd).strip()
result_data = json.loads(cmd_output.decode('utf-8'))
result_data.update(fixed_values)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment