import sys
import argparse
import json
import re
import os

import psycopg2
from psycopg2 import sql  # http://initd.org/psycopg/docs/sql.html#module-psycopg2.sql


def store_result(table_name, values_dict):
    """Insert one row built from *values_dict* into table *table_name*.

    The keys of ``values_dict`` are used as column names and its values as
    the row values.  Table and column names are quoted with ``psycopg2.sql``
    composables, and the values are bound as query parameters, so nothing
    is interpolated into the SQL string directly.

    For backward compatibility ``values_dict`` may also be a JSON object
    encoded as a ``str``; it is decoded before use.

    Raises whatever ``psycopg2`` raises on connection or execution errors;
    the connection and cursor are closed even on failure.
    """
    if isinstance(values_dict, str):
        # Accept a pre-serialized JSON object (the script historically
        # built the dataset info as a JSON string).
        values_dict = json.loads(values_dict)

    columns = list(values_dict.keys())
    query = sql.SQL("INSERT INTO {} ({}) VALUES ({})").format(
        sql.Identifier(table_name),
        sql.SQL(', ').join(map(sql.Identifier, columns)),
        sql.SQL(', ').join(map(sql.Placeholder, columns)),
    )

    # Connect to the database (connection parameters come from the
    # "ted-join" pg service definition).
    db = psycopg2.connect("service=ted-join")
    try:
        cur = db.cursor()
        try:
            # Log the statement that is about to be executed.
            print(query.as_string(cur))
            cur.execute(query, values_dict)
            db.commit()
        finally:
            # Close the cursor even if execute/commit raised.
            cur.close()
    finally:
        # Close the connection in all cases so a failed insert does not
        # leak it.
        db.close()


# parse input argurments
parser = argparse.ArgumentParser()
parser.add_argument("--inputfile", type=str,
                    help="path to input files containing line seperated trees in bracket notation")
parser.add_argument("--shortdesc", type=str, default="no description",
                    help="short description about the dataset")
parser.add_argument("--printlabels", action='store_true', dest='printlabels',
                    help="print label distribution")
parser.add_argument("--storeindb", action='store_true', dest='storeindb',
                    help="store the information about the dataset in a database")
args = parser.parse_args()

db_table_name = "dataset"  # name of the table in the database
short_description = args.shortdesc
path, filename = os.path.split(args.inputfile)

labels = {}  # dict that holds a counter for each node label
number_of_trees = 0
sum_of_tree_sizes = 0

# NOTE(review): the tree-reading loop is collapsed in this diff view; it
# fills number_of_trees, sum_of_tree_sizes, min_tree_size, max_tree_size
# and labels (labels[label] = labels.get(label, 0) + 1 per node label).

# Collect all information about the dataset in a dict, and let json.dumps
# produce the JSON output.  Building the JSON by string concatenation (as
# before) breaks as soon as the filename or --shortdesc contains a quote
# or backslash.
dataset_info = {
    "filename": filename,
    "short_description": short_description,
    "number_of_trees": number_of_trees,
    # Guard against an empty input file: avoid ZeroDivisionError.
    "avg_tree_size": (sum_of_tree_sizes / number_of_trees) if number_of_trees else 0,
    "min_tree_size": min_tree_size,
    "max_tree_size": max_tree_size,
    "different_label": len(labels),
}

# print result to stdout
print(json.dumps(dataset_info))

# store dataset in db; store_result expects a mapping of column -> value,
# so pass the dict itself, not its JSON string form.
if args.storeindb:
    store_result(db_table_name, dataset_info)

# print label distribution
if args.printlabels:
    print(" ")
    print("\"labels\": ", end='')
    print(json.dumps(labels))
import sys
import argparse
import json
import re
import os

import psycopg2
from psycopg2 import sql  # http://initd.org/psycopg/docs/sql.html#module-psycopg2.sql


def store_result(table_name, values_dict):
    """Insert one row built from *values_dict* into table *table_name*.

    The keys of ``values_dict`` are used as column names and its values as
    the row values.  Table and column names are quoted with ``psycopg2.sql``
    composables, and the values are bound as query parameters, so nothing
    is interpolated into the SQL string directly.

    For backward compatibility ``values_dict`` may also be a JSON object
    encoded as a ``str``; it is decoded before use.

    Raises whatever ``psycopg2`` raises on connection or execution errors;
    the connection and cursor are closed even on failure.
    """
    if isinstance(values_dict, str):
        # Accept a pre-serialized JSON object (the script historically
        # built the dataset info as a JSON string).
        values_dict = json.loads(values_dict)

    columns = list(values_dict.keys())
    query = sql.SQL("INSERT INTO {} ({}) VALUES ({})").format(
        sql.Identifier(table_name),
        sql.SQL(', ').join(map(sql.Identifier, columns)),
        sql.SQL(', ').join(map(sql.Placeholder, columns)),
    )

    # Connect to the database (connection parameters come from the
    # "ted-join" pg service definition).
    db = psycopg2.connect("service=ted-join")
    try:
        cur = db.cursor()
        try:
            # Log the statement that is about to be executed.
            print(query.as_string(cur))
            cur.execute(query, values_dict)
            db.commit()
        finally:
            # Close the cursor even if execute/commit raised.
            cur.close()
    finally:
        # Close the connection in all cases so a failed insert does not
        # leak it.
        db.close()


# parse input argurments
parser = argparse.ArgumentParser()
parser.add_argument("--inputfile", type=str,
                    help="path to input files containing line seperated trees in bracket notation")
parser.add_argument("--shortdesc", type=str, default="no description",
                    help="short description about the dataset")
parser.add_argument("--printlabels", action='store_true', dest='printlabels',
                    help="print label distribution")
parser.add_argument("--storeindb", action='store_true', dest='storeindb',
                    help="store the information about the dataset in a database")
args = parser.parse_args()

db_table_name = "dataset"  # name of the table in the database
short_description = args.shortdesc
path, filename = os.path.split(args.inputfile)

labels = {}  # dict that holds a counter for each node label
number_of_trees = 0
sum_of_tree_sizes = 0

# NOTE(review): the tree-reading loop is collapsed in this diff view; it
# fills number_of_trees, sum_of_tree_sizes, min_tree_size, max_tree_size
# and labels (labels[label] = labels.get(label, 0) + 1 per node label).

# Collect all information about the dataset in a dict, and let json.dumps
# produce the JSON output.  Building the JSON by string concatenation (as
# before) breaks as soon as the filename or --shortdesc contains a quote
# or backslash.
dataset_info = {
    "filename": filename,
    "short_description": short_description,
    "number_of_trees": number_of_trees,
    # Guard against an empty input file: avoid ZeroDivisionError.
    "avg_tree_size": (sum_of_tree_sizes / number_of_trees) if number_of_trees else 0,
    "min_tree_size": min_tree_size,
    "max_tree_size": max_tree_size,
    "different_label": len(labels),
}

# print result to stdout
print(json.dumps(dataset_info))

# store dataset in db; store_result expects a mapping of column -> value,
# so pass the dict itself, not its JSON string form.
if args.storeindb:
    store_result(db_table_name, dataset_info)

# print label distribution
if args.printlabels:
    print(" ")
    print("\"labels\": ", end='')
    print(json.dumps(labels))