Commit 0b033515 authored by Thomas Huetter's avatar Thomas Huetter
Browse files

statistics.py: write dataset information into a database

parent 19f39918
Loading
Loading
Loading
Loading
+50 −7
Original line number Original line Diff line number Diff line
@@ -14,16 +14,47 @@ import sys
import argparse
import argparse
import json
import json
import re
import re
import os
import psycopg2
from psycopg2 import sql

# http://initd.org/psycopg/docs/sql.html#module-psycopg2.sql
def store_result(table_name, values_dict):
  """Insert one row into a table reached through the "ted-join" service.

  The column names are taken from the keys of ``values_dict``; the values
  are bound as named query parameters. Identifiers and placeholders are
  composed with ``psycopg2.sql`` instead of manual string formatting.

  Args:
    table_name: Name of the target table.
    values_dict: Mapping from column name to value. A string holding a JSON
      object is accepted as well and is decoded before use.

  Raises:
    TypeError: If ``values_dict`` is not (or does not decode to) a mapping.
    ValueError: If ``values_dict`` has no entries, or is a string that is
      not valid JSON.
  """
  # Callers may hand over the dataset information as JSON text; decode it so
  # that the keys can serve as column names.
  if isinstance(values_dict, str):
    values_dict = json.loads(values_dict)
  try:
    attributes = list(values_dict.keys())
  except AttributeError:
    raise TypeError(
        "values_dict must be a mapping or a JSON object string") from None
  # Without columns the statement would read "INSERT INTO t () VALUES ()",
  # so reject the input before a connection is opened.
  if not attributes:
    raise ValueError("values_dict must contain at least one column")

  query = sql.SQL("INSERT INTO {} ({}) VALUES ({})").format(
      sql.Identifier(table_name),
      sql.SQL(', ').join(map(sql.Identifier, attributes)),
      sql.SQL(', ').join(map(sql.Placeholder, attributes))
  )

  # Connect to database.
  db = psycopg2.connect("service=ted-join")
  try:
    # "with db" commits on success and rolls back on an exception; the
    # cursor context manager closes the cursor in both cases.
    with db, db.cursor() as cur:
      print(query.as_string(cur))
      cur.execute(query, values_dict)
  finally:
    # Leaving the "with" block does not close the connection itself.
    db.close()



# parse input arguments
# parse input arguments
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser()
parser.add_argument("--inputfile", type=str, 
parser.add_argument("--inputfile", type=str, 
  help="path to input files containing line seperated trees in bracket notation")
  help="path to input files containing line seperated trees in bracket notation")
parser.add_argument("--shortdesc", type=str, default="no description",
  help="short description about the dataset")
parser.add_argument("--printlabels", action='store_true', dest='printlabels', 
parser.add_argument("--printlabels", action='store_true', dest='printlabels', 
  help="print label distribution")
  help="print label distribution")
parser.add_argument("--storeindb", action='store_true', dest='storeindb', 
  help="store the information about the dataset in a database")


args = parser.parse_args()
args = parser.parse_args()


# Settings derived from the command line that describe the dataset record.
db_table_name = "dataset" # name of the table in the database
# Free-text description taken from --shortdesc ("no description" by default).
short_description = args.shortdesc
# Split the input path into its directory part and the bare file name.
# NOTE(review): --inputfile has no default, so this presumably fails when the
# option is omitted - confirm whether it should be a required argument.
path, filename = os.path.split(args.inputfile)
labels = {} # dict that holds a counter for each node label
labels = {} # dict that holds a counter for each node label
number_of_trees = 0
number_of_trees = 0
sum_of_tree_sizes = 0
sum_of_tree_sizes = 0
@@ -53,14 +84,26 @@ with open(args.inputfile) as f:
      labels[label] = labels.get(label, 0) + 1
      labels[label] = labels.get(label, 0) + 1




# create json that holds all information about the dataset
# json.dumps escapes quotes, backslashes and control characters in the
# filename and the description; copying them verbatim between hand-written
# quotes can yield invalid JSON. The default ", " and ": " separators keep
# the one-line layout, and ensure_ascii=False leaves non-ASCII text readable.
# NOTE(review): the average divides by number_of_trees and therefore raises
# ZeroDivisionError when no tree was counted - confirm that is acceptable.
dataset_info = json.dumps({
  "filename": str(filename),
  "short_description": str(short_description),
  "number_of_trees": number_of_trees,
  "avg_tree_size": sum_of_tree_sizes/number_of_trees,
  "min_tree_size": min_tree_size,
  "max_tree_size": max_tree_size,
  "different_label": len(labels),
}, ensure_ascii=False)

# print result to stdout
# print result to stdout
print("{\"number_of_trees\": " + str(number_of_trees), end='')
print(dataset_info)
print(", \"avg_tree_size\": " + str(sum_of_tree_sizes/number_of_trees), end='')

print(", \"min_tree_size\": " + str(min_tree_size), end='')
# store dataset in db
print(", \"max_tree_size\": " + str(max_tree_size), end='')
if args.storeindb:
print(", \"different_label\": " + str(len(labels)), end='')
  store_result(db_table_name, dataset_info)

# print label distribution
if args.printlabels:
if args.printlabels:
  print(", ")
  print(" ")
  print("\"labels\": ", end='')
  print("\"labels\": ", end='')
  print(str(labels).replace("\'", "\""))
  print(str(labels).replace("\'", "\""))
print("}")
 No newline at end of file