Commit 0b033515 authored by Thomas Huetter's avatar Thomas Huetter

statistics.py: write dataset information into a database

parent 19f39918
...@@ -14,16 +14,47 @@ import sys ...@@ -14,16 +14,47 @@ import sys
import argparse import argparse
import json import json
import re import re
import os
import psycopg2
from psycopg2 import sql
# http://initd.org/psycopg/docs/sql.html#module-psycopg2.sql
def store_result(table_name, values_dict):
    """Insert one row into ``table_name`` built from ``values_dict``.

    The keys of ``values_dict`` are used as column names and as named
    placeholders; the values are bound by psycopg2 when the statement is
    executed. Identifiers are composed with ``psycopg2.sql`` instead of
    string formatting, so table and column names are quoted safely.

    Args:
        table_name: Name of the target table.
        values_dict: Mapping of column name to value. A string holding a
            JSON object is accepted as well and decoded first, because the
            script assembles its dataset information as JSON text.

    Raises:
        ValueError: If a string is passed that is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
        psycopg2.Error: If connecting or executing the statement fails.
    """
    # Decode JSON text up front so a malformed payload never opens a
    # database connection.
    if isinstance(values_dict, str):
        values_dict = json.loads(values_dict)
    attributes = list(values_dict)

    # Connect to database (libpq connection string "service=ted-join").
    db = psycopg2.connect("service=ted-join")
    try:
        # Open a cursor to perform database operations.
        cur = db.cursor()
        try:
            query = sql.SQL("INSERT INTO {} ({}) VALUES ({})").format(
                sql.Identifier(table_name),
                sql.SQL(', ').join(map(sql.Identifier, attributes)),
                sql.SQL(', ').join(map(sql.Placeholder, attributes))
            )
            # Echo the composed statement, as the original did.
            print(query.as_string(cur))
            cur.execute(query, values_dict)
            db.commit()
        finally:
            # Close the cursor even when composing/executing raised.
            cur.close()
    finally:
        # Close communication with the database on every path.
        db.close()
# parse input arguments # parse input arguments
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--inputfile", type=str, parser.add_argument("--inputfile", type=str,
help="path to input files containing line seperated trees in bracket notation") help="path to input files containing line seperated trees in bracket notation")
parser.add_argument("--shortdesc", type=str, default="no description",
help="short description about the dataset")
parser.add_argument("--printlabels", action='store_true', dest='printlabels', parser.add_argument("--printlabels", action='store_true', dest='printlabels',
help="print label distribution") help="print label distribution")
parser.add_argument("--storeindb", action='store_true', dest='storeindb',
help="store the information about the dataset in a database")
args = parser.parse_args() args = parser.parse_args()
db_table_name = "dataset" # name of the table in the database
short_description = args.shortdesc
path, filename = os.path.split(args.inputfile)
labels = {} # dict that holds a counter for each node label labels = {} # dict that holds a counter for each node label
number_of_trees = 0 number_of_trees = 0
sum_of_tree_sizes = 0 sum_of_tree_sizes = 0
...@@ -53,14 +84,26 @@ with open(args.inputfile) as f: ...@@ -53,14 +84,26 @@ with open(args.inputfile) as f:
labels[label] = labels.get(label, 0) + 1 labels[label] = labels.get(label, 0) + 1
# create json that holds all information about the dataset
dataset_info = "{"
dataset_info += "\"filename\": \"" + str(filename) + "\""
dataset_info += ", \"short_description\": \"" + str(short_description) + "\""
dataset_info += ", \"number_of_trees\": " + str(number_of_trees)
dataset_info += ", \"avg_tree_size\": " + str(sum_of_tree_sizes/number_of_trees)
dataset_info += ", \"min_tree_size\": " + str(min_tree_size)
dataset_info += ", \"max_tree_size\": " + str(max_tree_size)
dataset_info += ", \"different_label\": " + str(len(labels))
dataset_info += "}"
# print result to stdout # print result to stdout
print("{\"number_of_trees\": " + str(number_of_trees), end='') print(dataset_info)
print(", \"avg_tree_size\": " + str(sum_of_tree_sizes/number_of_trees), end='')
print(", \"min_tree_size\": " + str(min_tree_size), end='') # store dataset in db
print(", \"max_tree_size\": " + str(max_tree_size), end='') if args.storeindb:
print(", \"different_label\": " + str(len(labels)), end='') store_result(db_table_name, dataset_info)
# print label distribution
if args.printlabels: if args.printlabels:
print(", ") print(" ")
print("\"labels\": ", end='') print("\"labels\": ", end='')
print(str(labels).replace("\'", "\"")) print(str(labels).replace("\'", "\""))
print("}")
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment