#!/usr/bin/env python
'''
    File name: statistics.py
    Author: Thomas Huetter
    Program: Compute statistics over a dataset of line-separated trees in
             bracket notation (number of trees, tree sizes, label
             distribution). The output is written to stdout in json format
             and can optionally be stored in a database.

    TODO: - Quantile, Median
          - Depth, Fanout
          - Percentage of leaves
'''

import sys
import argparse
import json
import re
import os
import psycopg2
from psycopg2 import sql

# http://initd.org/psycopg/docs/sql.html#module-psycopg2.sql
def store_result(service, table_name, values_dict):
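  '''Insert one row into table_name, using the keys of values_dict as
     column names and its values as the row values.'''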
  # Connect to database.
  db = psycopg2.connect("service=" + service)
  # Open a cursor to perform database operations
  cur = db.cursor()
  attributes = values_dict.keys()
  query = sql.SQL("INSERT INTO {} ({}) VALUES ({})").format(
      sql.Identifier(table_name),
      sql.SQL(', ').join(map(sql.Identifier, attributes)),
      sql.SQL(', ').join(map(sql.Placeholder, attributes))
  )
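  # e.g. for a "dataset" table the composed query renders as:
  #   INSERT INTO "dataset" ("filename", "number_trees", ...) VALUES (%(filename)s, %(number_trees)s, ...)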
  print(query.as_string(cur))
  cur.execute(query, values_dict)
  db.commit()
  # Close the cursor.
  cur.close()
  # Close communication with the database.
  db.close()


# parse input arguments
parser = argparse.ArgumentParser()
parser.add_argument("--inputfile", type=str, required=True, 
45
  help="path to input files containing line seperated trees in bracket notation")
parser.add_argument("--shortdesc", type=str, default="no description",
  help="short description about the dataset")
parser.add_argument("--printlabels", action='store_true', dest='printlabels', 
  help="print label distribution")
parser.add_argument('--service', type=str, 
  help="Service name for db connection. Specify this parameter to store the results in a database.")

args = parser.parse_args()

db_table_name = "dataset" # name of the table in the database
short_description = args.shortdesc
path, filename = os.path.split(args.inputfile)
labels = {} # dict that holds a counter for each node label
number_of_trees = 0
sum_of_tree_sizes = 0
min_tree_size = 0
max_tree_size = 0

# open inputfile
with open(args.inputfile) as f:
  # for each tree in the inputfile
  for line in f:
    # remove all '}', split the line at '{' and drop the empty string before the first '{'
    nodes = re.split(r'[{]', re.sub(r'[}]', '', line.rstrip()))[1:]
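    # e.g. the line "{a{b}{c}}" yields nodes = ['a', 'b', 'c']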
    # get tree size
    tree_size = len(nodes)
    # sum to compute average
    sum_of_tree_sizes += tree_size

    # do statistics
    number_of_trees += 1
    if min_tree_size == 0 or tree_size < min_tree_size:
      min_tree_size = tree_size
    if tree_size > max_tree_size:
      max_tree_size = tree_size

    # take a look at each node label
    for label in nodes:
      labels[label] = labels.get(label, 0) + 1


# create json that holds all information about the dataset
dataset_info = "{"
dataset_info += "\"filename\": \"" + str(filename) + "\""
dataset_info += ", \"short_description\": \"" + str(short_description) + "\""
dataset_info += ", \"number_trees\": " + str(number_of_trees)
dataset_info += ", \"avg_tree_size\": " + str(sum_of_tree_sizes/number_of_trees)
dataset_info += ", \"min_tree_size\": " + str(min_tree_size)
dataset_info += ", \"max_tree_size\": " + str(max_tree_size)
dataset_info += ", \"number_labels\": " + str(len(labels))
dataset_info += "}"

# print result to stdout
print(dataset_info)

# store dataset in db
if args.service:
  store_result(args.service, db_table_name, json.loads(dataset_info))

# print label distribution
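# e.g. output (hypothetical counts): "labels": {"a": 10, "b": 3, "c": 1}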
if args.printlabels:
  print(" ")
  print("\"labels\": ", end='')
  print(str(labels).replace("\'", "\""))