Commit a0dd0403 authored by Mateusz Pawlik's avatar Mateusz Pawlik
Browse files

Added first working version of a facility to test performance of TED algorithms.

parent de874485
cmake_minimum_required(VERSION 2.8)
# Create the project.
project(ted-join-experiments)
project(ted-experiments)
# Compiler flags.
# MUST be declared after project().
......@@ -20,12 +20,6 @@ add_executable(
ted-join-experiments # EXECUTABLE NAME
src/experiments.cc # EXECUTABLE SOURCE
)
# Link the timing library and header file.
add_library(
TimingLibrary # LIBRARY NAME
external/common-code/timings/timing.cxx # LIBRARY SOURCE
)
target_link_libraries(
ted-join-experiments # TARGET EXECUTABLE NAME
TimingLibrary # LIBRARY NAME
......@@ -35,6 +29,26 @@ target_include_directories(
PUBLIC external/common-code/timings # HEADER FILE
)
# Build executable with the experiments.
add_executable(
ted-algs-experiments # EXECUTABLE NAME
src/ted_algs/ted_algs_experiments.cc # EXECUTABLE SOURCE
)
target_link_libraries(
ted-algs-experiments # TARGET EXECUTABLE NAME
TimingLibrary # LIBRARY NAME
)
target_include_directories(
ted-algs-experiments # TARGET EXECUTABLE NAME
PUBLIC external/common-code/timings # HEADER FILE
)
# Create the timing library.
add_library(
TimingLibrary # LIBRARY NAME
external/common-code/timings/timing.cxx # LIBRARY SOURCE
)
# Create header-only library 'TreeSimilarity' with our algorithms.
add_library(
TreeSimilarity INTERFACE
......@@ -58,3 +72,9 @@ target_link_libraries(
ted-join-experiments # EXECUTABLE NAME
TreeSimilarity # LIBRARY NAME
)
# Let the compiler know to find the header files in TreeSimilarity library.
target_link_libraries(
ted-algs-experiments # EXECUTABLE NAME
TreeSimilarity # LIBRARY NAME
)
// The MIT License (MIT)
// Copyright (c) 2017 Mateusz Pawlik
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
/// \file src/ted_algs/ted_algs_experiments.cc
///
/// \details
/// Implements an experimental environment that executes ted algorithms.
#include "ted_algs_experiments.h"
/// \brief Result of one TED computation on a pair of trees.
///
/// Stores the identifiers and sizes of both trees together with the values
/// measured for the run, and serializes them as a JSON object or a CSV row.
struct DataItem {
  /// Identifier of the first tree of the pair.
  unsigned int tree_id_1 = 0;
  /// Identifier of the second tree of the pair.
  unsigned int tree_id_2 = 0;
  /// Size of the first tree.
  unsigned int tree_size_1 = 0;
  /// Size of the second tree.
  unsigned int tree_size_2 = 0;
  /// Distance value produced by the executed algorithm.
  double ted = 0.0;
  /// Measured runtime as reported by the timer (unit is defined by the
  /// timing library, not visible here).
  double runtime = 0.0;
  /// Number of subproblems reported by the executed algorithm.
  unsigned long long int subproblems = 0;

  /// Creates an item with all fields set to zero.
  DataItem() {}

  /// Creates a fully populated item.
  ///
  /// \param tid1 Identifier of the first tree.
  /// \param tid2 Identifier of the second tree.
  /// \param s1 Size of the first tree.
  /// \param s2 Size of the second tree.
  /// \param ted Computed distance.
  /// \param r Measured runtime.
  /// \param s Number of subproblems.
  DataItem(unsigned int tid1, unsigned int tid2, unsigned int s1,
      unsigned int s2, double ted, double r, unsigned long long int s) :
    tree_id_1{tid1},
    tree_id_2{tid2},
    tree_size_1{s1},
    // Must come from s2; using s1 here reported the first tree's size twice.
    tree_size_2{s2},
    ted{ted},
    runtime{r},
    subproblems{s}
  {}

  /// \return The item as a JSON object with one key per field.
  std::string to_json_string() const {
    std::string output = "{";
    output += "\"tree_id_1\" : " + std::to_string(tree_id_1) + ", ";
    output += "\"tree_id_2\" : " + std::to_string(tree_id_2) + ", ";
    output += "\"tree_size_1\" : " + std::to_string(tree_size_1) + ", ";
    output += "\"tree_size_2\" : " + std::to_string(tree_size_2) + ", ";
    output += "\"ted\" : " + std::to_string(ted) + ", ";
    output += "\"subproblems\" : " + std::to_string(subproblems) + ", ";
    output += "\"runtime\" : " + std::to_string(runtime);
    output += "}";
    return output;
  }

  /// \return The item as one CSV row (no trailing newline) with the columns
  ///         tree_id_1,tree_id_2,tree_size_1,tree_size_2,ted,subproblems,runtime.
  std::string to_csv_string() const {
    std::string output;
    output += std::to_string(tree_id_1) + "," + std::to_string(tree_id_2) + "," +
        std::to_string(tree_size_1) + "," + std::to_string(tree_size_2) + "," +
        std::to_string(ted) + "," + std::to_string(subproblems) + "," +
        std::to_string(runtime);
    return output;
  }
};
/// \brief All results collected for one algorithm.
///
/// Pairs the algorithm's display name with the DataItem produced for each
/// executed tree pair, and serializes them as JSON or CSV.
struct AlgorithmItem {
  /// Name under which the algorithm is reported in the output.
  std::string algorithm_name;
  /// One entry per executed tree pair.
  std::vector<DataItem> data_items;

  /// Creates an item with an empty name and no results.
  AlgorithmItem() {}

  /// Creates an item with a name and no results.
  ///
  /// \param a_name Name of the algorithm.
  AlgorithmItem(std::string a_name) : algorithm_name{a_name} {}

  /// Creates an item with a name and its results.
  ///
  /// \param a_name Name of the algorithm.
  /// \param d_items Results of the algorithm's executions.
  AlgorithmItem(std::string a_name, std::vector<DataItem> d_items) :
    algorithm_name{a_name},
    data_items{d_items}
  {}

  /// \return A JSON object with the keys "algorithm_name" and "data_items";
  ///         "data_items" is an empty array when there are no results.
  std::string to_json_string() {
    std::string output;
    output += "{\"algorithm_name\" : \"" + algorithm_name +
        "\", \"data_items\" : [";
    // Emit the separator before every element but the first, so that an
    // empty list cannot lose the opening bracket to a trailing-comma removal.
    bool is_first = true;
    for (auto& di : data_items) {
      if (!is_first) {
        output += ",";
      }
      output += di.to_json_string();
      is_first = false;
    }
    output += "]}";
    return output;
  }

  /// \return One CSV row per result, each prefixed with the algorithm name,
  ///         separated by newlines without a trailing newline; an empty
  ///         string when there are no results.
  std::string to_csv_string() {
    std::string output;
    bool is_first = true;
    for (auto& di : data_items) {
      if (!is_first) {
        output += "\n";
      }
      output += algorithm_name + "," + di.to_csv_string();
      is_first = false;
    }
    return output;
  }
};
/// \brief Results of an entire experiment run.
///
/// Holds the time spent on parsing the dataset and the results of every
/// executed algorithm, and serializes them as JSON or CSV.
struct Experiment {
  /// Measured time of parsing the input dataset (as reported by the timer).
  double dataset_parsing_time = 0.0;
  /// One entry per executed algorithm.
  std::vector<AlgorithmItem> algorithm_executions;

  /// Creates an experiment with zero parsing time and no executions.
  Experiment() {}

  /// Creates an experiment with the given parsing time and no executions.
  ///
  /// \param p_time Measured dataset parsing time.
  Experiment(double p_time) : dataset_parsing_time{p_time} {}

  /// \return A JSON object with the keys "dataset_parsing_time" and
  ///         "algorithm_executions"; the latter is an empty array when no
  ///         algorithm was executed.
  std::string to_json_string() {
    std::string output;
    output += "{\"dataset_parsing_time\" : " + std::to_string(dataset_parsing_time) +
        ", \"algorithm_executions\" : [";
    // Separator-before-element keeps the opening bracket intact when the
    // list is empty.
    bool is_first = true;
    for (auto& a : algorithm_executions) {
      if (!is_first) {
        output += ",";
      }
      output += a.to_json_string();
      is_first = false;
    }
    output += "]}";
    return output;
  }

  /// \return The CSV rows of all algorithms, separated by newlines without a
  ///         trailing newline; an empty string when there is nothing to
  ///         report. The parsing time is not part of the CSV output.
  std::string to_csv_string() {
    std::string output;
    bool is_first = true;
    for (auto& a : algorithm_executions) {
      // An algorithm without results contributes no rows; skipping it avoids
      // blank lines in the output.
      if (a.data_items.empty()) {
        continue;
      }
      if (!is_first) {
        output += "\n";
      }
      output += a.to_csv_string();
      is_first = false;
    }
    return output;
  }
};
/// \brief Runs one TED algorithm on a single pair of trees and records the result.
///
/// Only the call of the algorithm's member function is enclosed by the
/// start/stop of the timing interval; constructing the algorithm object and
/// reading its subproblem count happen outside of it.
///
/// \tparam Label Type of the node labels.
/// \tparam Algorithm Class implementing the algorithm; must be default
///         constructible and provide get_subproblem_count().
/// \tparam _ted Member function of Algorithm that computes the distance.
/// \param t1_id Identifier of the first tree.
/// \param t2_id Identifier of the second tree.
/// \param t1 First tree.
/// \param t2 Second tree.
/// \return DataItem holding the ids, tree sizes, distance, runtime, and
///         subproblem count of this execution.
template <typename Label, typename Algorithm, double (Algorithm::*_ted)(const node::Node<Label>&, const node::Node<Label>&)>
DataItem execute_ted_alg(const unsigned int t1_id, const unsigned int t2_id, const node::Node<Label>& t1, const node::Node<Label>& t2) {
  Algorithm algorithm;
  Timing timer;
  Timing::Interval* runtime_interval = timer.create_enroll("AlgorithmRuntime");

  // Timed section: the algorithm call only.
  runtime_interval->start();
  const auto distance = (algorithm.*_ted)(t1, t2);
  runtime_interval->stop();

  const auto subproblem_count = algorithm.get_subproblem_count();
  return DataItem(t1_id, t2_id, t1.get_tree_size(), t2.get_tree_size(),
      distance, runtime_interval->getfloat(), subproblem_count);
}
/// \brief Executes a TED algorithm on every pair of neighbouring trees.
///
/// The pairs are (0,1), (1,2), ..., (n-2,n-1), where the numbers are the
/// positions of the trees in the collection; these positions are also used
/// as the tree identifiers in the results.
///
/// \tparam Label Type of the node labels.
/// \tparam Algorithm Class implementing the algorithm.
/// \tparam _ted Member function of Algorithm that computes the distance.
/// \param trees_collection Input trees; not modified by this function.
/// \return One DataItem per executed pair, in collection order; empty if the
///         collection holds fewer than two trees.
template <typename Label, typename Algorithm, double (Algorithm::*_ted)(const node::Node<Label>&, const node::Node<Label>&)>
std::vector<DataItem> execute_overlaping_pairs(std::vector<node::Node<Label>>& trees_collection) {
  std::vector<DataItem> execution_results;
  // With fewer than two trees there is no pair to execute. Returning early
  // also avoids forming end()-1 on an empty collection, which is undefined.
  if (trees_collection.size() < 2) {
    return execution_results;
  }
  execution_results.reserve(trees_collection.size() - 1);
  const auto last_it = std::end(trees_collection) - 1;
  unsigned int tree_id = 0;
  for (auto it = std::begin(trees_collection); it != last_it; ++it, ++tree_id) {
    // Bind by reference: the trees are only read, so copying them for every
    // pair is unnecessary work.
    const node::Node<Label>& t1 = *it;
    const node::Node<Label>& t2 = *(it + 1);
    execution_results.push_back(execute_ted_alg<Label, Algorithm, _ted>(tree_id, tree_id + 1, t1, t2));
  }
  return execution_results;
}
/// \brief Runs one threshold-based TED algorithm on a single pair of trees
///        and records the result.
///
/// Only the call of the algorithm's member function is enclosed by the
/// start/stop of the timing interval; constructing the algorithm object and
/// reading its subproblem count happen outside of it.
///
/// \tparam Label Type of the node labels.
/// \tparam Algorithm Class implementing the algorithm; must be default
///         constructible and provide get_subproblem_count().
/// \tparam _ted Member function of Algorithm that computes the distance and
///         takes the threshold as its third argument.
/// \param t1_id Identifier of the first tree.
/// \param t2_id Identifier of the second tree.
/// \param t1 First tree.
/// \param t2 Second tree.
/// \param k Threshold value forwarded unchanged to the algorithm.
/// \return DataItem holding the ids, tree sizes, distance, runtime, and
///         subproblem count of this execution.
template <typename Label, typename Algorithm, double (Algorithm::*_ted)(const node::Node<Label>&, const node::Node<Label>&, const int k)>
DataItem execute_ted_alg_k(const unsigned int t1_id, const unsigned int t2_id, const node::Node<Label>& t1, const node::Node<Label>& t2, const int k) {
  Algorithm algorithm;
  Timing timer;
  Timing::Interval* runtime_interval = timer.create_enroll("AlgorithmRuntime");

  // Timed section: the algorithm call only.
  runtime_interval->start();
  const auto distance = (algorithm.*_ted)(t1, t2, k);
  runtime_interval->stop();

  const auto subproblem_count = algorithm.get_subproblem_count();
  return DataItem(t1_id, t2_id, t1.get_tree_size(), t2.get_tree_size(),
      distance, runtime_interval->getfloat(), subproblem_count);
}
/// \brief Executes a threshold-based TED algorithm on every pair of
///        neighbouring trees.
///
/// The pairs are (0,1), (1,2), ..., (n-2,n-1), where the numbers are the
/// positions of the trees in the collection; these positions are also used
/// as the tree identifiers in the results.
///
/// \tparam Label Type of the node labels.
/// \tparam Algorithm Class implementing the algorithm.
/// \tparam _ted Member function of Algorithm that computes the distance and
///         takes the threshold as its third argument.
/// \param trees_collection Input trees; not modified by this function.
/// \param k Threshold value forwarded unchanged to every execution.
/// \return One DataItem per executed pair, in collection order; empty if the
///         collection holds fewer than two trees.
template <typename Label, typename Algorithm, double (Algorithm::*_ted)(const node::Node<Label>&, const node::Node<Label>&, const int k)>
std::vector<DataItem> execute_overlaping_pairs_k(std::vector<node::Node<Label>>& trees_collection, const int k) {
  std::vector<DataItem> execution_results;
  // With fewer than two trees there is no pair to execute. Returning early
  // also avoids forming end()-1 on an empty collection, which is undefined.
  if (trees_collection.size() < 2) {
    return execution_results;
  }
  execution_results.reserve(trees_collection.size() - 1);
  const auto last_it = std::end(trees_collection) - 1;
  unsigned int tree_id = 0;
  for (auto it = std::begin(trees_collection); it != last_it; ++it, ++tree_id) {
    // Bind by reference: the trees are only read, so copying them for every
    // pair is unnecessary work.
    const node::Node<Label>& t1 = *it;
    const node::Node<Label>& t2 = *(it + 1);
    execution_results.push_back(execute_ted_alg_k<Label, Algorithm, _ted>(tree_id, tree_id + 1, t1, t2, k));
  }
  return execution_results;
}
/// \brief Entry point of the TED algorithms experiment.
///
/// Parses the command line, parses the input collection once, executes every
/// selected algorithm on the neighbouring pairs of trees, and writes the
/// results to stdout in the requested format(s).
///
/// Command line arguments:
///   --input <path>       File with the input trees (required).
///   --threshold <value>  Similarity threshold; rounded up to an integer.
///                        Required if any Touzet variant is selected.
///   --output <json|csv>  Output format; other values are ignored. Without
///                        this argument nothing is printed.
///   --zs --apted --tz --tzd --tzs --tzl  Algorithm selection (see below).
/// Unknown arguments are ignored.
///
/// \return 0 on success, 1 if the command line is incomplete.
int main(int argc, char** argv) {
  using Label = label::StringLabel;
  using CostModel = cost_model::UnitCostModel<Label>;
  // TED algorithms.
  using ZhangShasha = ted::ZhangShasha<Label, CostModel>;
  using APTED = ted::APTED<Label, CostModel>;
  // TED-k algorithms.
  using Touzet = ted::Touzet<Label, CostModel>;
  // TODO: LabelGuided has no functions with the used signatures.
  // using LabelGuided = ted_ub::GreedyUB<Label, CostModel>;

  Timing timing;

  // Path to file containing the input trees.
  std::string input_file_path;

  // Experiment mechanism.
  //
  // Possible execution mechanisms over the input dataset:
  // --overlaping-pairs (default) First with second, second with third, and so on.
  // Not selectable yet; overlapping pairs are always used.

  // Similarity threshold - maximum number of allowed edit operations.
  // TODO: If no threshold is provided, it should be set to something
  //       reasonable (default). Until then a missing threshold is an error
  //       whenever a threshold-based algorithm is selected.
  int similarity_threshold = 0;
  bool threshold_is_set = false;

  // Names of the used TED algorithms (a list).
  //
  // Possible algorithms (command line arguments):
  // --zs     Zhang and Shasha              zhang_shasha_ted
  // --apted  APTED                         apted_ted
  // --tz     Touzet - basic version        touzet_ted
  // --tzd    Touzet - depth-based pruning  touzet_ted_depth_pruning
  // --tzs    Touzet - keyroot nodes        touzet_ted_kr_loop (reported as "TouzetKrLoop")
  // --tzl    Touzet - keyroot nodes        touzet_ted_kr_set  (reported as "TouzetKrSet")
  // --lg     LabelGuided                   TODO
  // --lgd    LabelGuided - deprecated      TODO
  // NOTE(review): the flag letters suggest --tzs = "set" and --tzl = "loop",
  // which is the opposite of the mapping implemented below - confirm which
  // one is intended.
  bool alg_zs_is_set = false;
  bool alg_apted_is_set = false;
  bool alg_tz_is_set = false;
  bool alg_tzd_is_set = false;
  bool alg_tzs_is_set = false;
  bool alg_tzl_is_set = false;

  // Output format
  bool output_in_json = false;
  bool output_in_csv = false;

  // Parse command line arguments.
  // TODO: Check for further possible errors (e.g., a non-numeric threshold
  //       makes std::stod throw).
  std::vector<std::string> args(argv, argv + argc);
  auto args_it = std::begin(args);
  const auto args_end_it = std::end(args);
  // The first element is the program name, not an argument.
  if (args_it != args_end_it) {
    ++args_it;
  }
  while (args_it != args_end_it) {
    const std::string a = *args_it;
    if (a == "--input" || a == "--threshold" || a == "--output") {
      // These arguments consume the next element as their value. Verify that
      // it exists before dereferencing; otherwise the iterator would run
      // past the end of the argument list.
      ++args_it;
      if (args_it == args_end_it) {
        std::cerr << "Missing value for argument " << a << std::endl;
        return 1;
      }
      const std::string& value = *args_it;
      if (a == "--input") {
        input_file_path = value;
      } else if (a == "--threshold") {
        similarity_threshold = static_cast<int>(std::ceil(std::stod(value)));
        threshold_is_set = true;
      } else {
        if (value == "json") {
          output_in_json = true;
        } else if (value == "csv") {
          output_in_csv = true;
        }
      }
    } else if (a == "--zs") {
      alg_zs_is_set = true;
    } else if (a == "--apted") {
      alg_apted_is_set = true;
    } else if (a == "--tz") {
      alg_tz_is_set = true;
    } else if (a == "--tzd") {
      alg_tzd_is_set = true;
    } else if (a == "--tzs") {
      alg_tzs_is_set = true;
    } else if (a == "--tzl") {
      alg_tzl_is_set = true;
    }
    ++args_it;
  }

  if (input_file_path.empty()) {
    std::cerr << "Missing required argument --input" << std::endl;
    return 1;
  }
  // Threshold-based algorithms must not run with an unspecified threshold.
  const bool threshold_is_needed =
      alg_tz_is_set || alg_tzd_is_set || alg_tzs_is_set || alg_tzl_is_set;
  if (threshold_is_needed && !threshold_is_set) {
    std::cerr << "Missing required argument --threshold for the selected Touzet algorithm(s)" << std::endl;
    return 1;
  }

  Experiment experiment;

  // PARSE INPUT
  // The input is parsed once for the entire experiment.
  std::vector<node::Node<Label>> trees_collection;
  {
    Timing::Interval * parse = timing.create_enroll("Parse");
    parse->start();
    parser::BracketNotationParser bnp;
    bnp.parse_collection(trees_collection, input_file_path);
    parse->stop();
    experiment.dataset_parsing_time = parse->getfloat();
  }

  // EXECUTE ALGORITHMS
  if (alg_zs_is_set) {
    experiment.algorithm_executions.emplace_back("ZhangShasha",
        execute_overlaping_pairs<Label, ZhangShasha, &ZhangShasha::zhang_shasha_ted>(trees_collection));
  }
  if (alg_apted_is_set) {
    experiment.algorithm_executions.emplace_back("APTED",
        execute_overlaping_pairs<Label, APTED, &APTED::apted_ted>(trees_collection));
  }
  if (alg_tz_is_set) {
    experiment.algorithm_executions.emplace_back("Touzet",
        execute_overlaping_pairs_k<Label, Touzet, &Touzet::touzet_ted>(trees_collection, similarity_threshold));
  }
  if (alg_tzd_is_set) {
    experiment.algorithm_executions.emplace_back("TouzetDP",
        execute_overlaping_pairs_k<Label, Touzet, &Touzet::touzet_ted_depth_pruning>(trees_collection, similarity_threshold));
  }
  if (alg_tzs_is_set) {
    experiment.algorithm_executions.emplace_back("TouzetKrLoop",
        execute_overlaping_pairs_k<Label, Touzet, &Touzet::touzet_ted_kr_loop>(trees_collection, similarity_threshold));
  }
  if (alg_tzl_is_set) {
    experiment.algorithm_executions.emplace_back("TouzetKrSet",
        execute_overlaping_pairs_k<Label, Touzet, &Touzet::touzet_ted_kr_set>(trees_collection, similarity_threshold));
  }

  // OUTPUT RESULTS
  if (output_in_json) {
    std::cout << experiment.to_json_string() << std::endl;
  }
  if (output_in_csv) {
    std::cout << experiment.to_csv_string() << std::endl;
  }

  return 0;
}
// The MIT License (MIT)
// Copyright (c) 2017 Mateusz Pawlik
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
/// \file src/ted_algs/ted_algs_experiments.h
///
/// \details
/// Collects the headers needed by the experimental environment that executes ted algorithms.
#ifndef TED_ALGS_EXPERIMENTS_H
#define TED_ALGS_EXPERIMENTS_H
#include <iostream>
#include <time.h>
#include <string>
#include <vector>
#include <unordered_set>
#include "timing.h"
#include "node.h"
#include "string_label.h"
#include "unit_cost_model.h"
#include "zhang_shasha.h"
#include "touzet.h"
#include "greedy_ub.h"
#include "apted.h"
#include "bracket_notation_parser.h"
#endif // TED_ALGS_EXPERIMENTS_H
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment