Commit 4e0ee533 authored by Thomas Huetter's avatar Thomas Huetter
Browse files

statistics.py: added new statistics implementation

parent 4f71576b
Loading
Loading
Loading
Loading
+71 −0
Original line number Diff line number Diff line
#!/usr/bin/env python
'''
    File name: local_experiments.py
    Author: Thomas Huetter
    Program: Wrapper script to call ../build/ted-join-experiments. Instead 
             of writing to the database, the output is written to stdout in
             json format. Called from /src/.
'''

import sys
import argparse
import json
import re

def str2bool(v):
    """Convert a textual boolean flag into a real ``bool``.

    The comparison is case-insensitive. 'yes', 'true', 't', 'y' and '1'
    map to True; 'no', 'false', 'f', 'n' and '0' map to False.

    Raises:
        argparse.ArgumentTypeError: if ``v`` is none of the accepted
            spellings.
    """
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')

    # Lower-case once and reuse the result for both lookups.
    normalized = v.lower()
    if normalized in truthy:
        return True
    if normalized in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

# Parse the command-line arguments.
parser = argparse.ArgumentParser()
# --inputfile is mandatory: the script cannot do anything without it, and
# leaving it optional lets `None` reach open(), which fails with an opaque
# TypeError instead of a usage message.
parser.add_argument(
    "--inputfile", type=str, required=True,
    help="path to input files containing line seperated trees in bracket notation")
# Optional switch: when given, the per-label counts are added to the output.
parser.add_argument(
    "--printlabels", action='store_true', dest='printlabels',
    help="print label distribution")

args = parser.parse_args()

labels = {}  # maps each node label to the number of times it occurs
number_of_trees = 0
sum_of_tree_sizes = 0
max_tree_size = 0
# None means "no tree seen yet". Using 0 as the sentinel would clash with a
# genuine size of 0 (e.g. a blank line), and the next tree would then
# overwrite the real minimum.
min_tree_size = None

# Matches runs of braces; compiled once instead of once per input line.
brace_runs = re.compile(r'[{}]+')

# Read the input file; every line is treated as one tree.
with open(args.inputfile) as f:
    for line in f:
        # Splitting on runs of '{' / '}' leaves one entry per label; the
        # first and last entries (the text before the opening and after the
        # closing brace) are dropped.
        # NOTE: consecutive braces collapse into a single separator, so
        # nodes with an empty label are not counted.
        nodes = brace_runs.split(line.strip())[1:-1]
        tree_size = len(nodes)

        # Running totals; the sum is used later to compute the average.
        number_of_trees += 1
        sum_of_tree_sizes += tree_size

        if min_tree_size is None or tree_size < min_tree_size:
            min_tree_size = tree_size
        if tree_size > max_tree_size:
            max_tree_size = tree_size

        # Count how often each label occurs across all trees.
        for label in nodes:
            labels[label] = labels.get(label, 0) + 1

# Keep min_tree_size an integer for the code that prints it, even when the
# input file contained no lines at all.
if min_tree_size is None:
    min_tree_size = 0


# Print the result to stdout as a single JSON object.
# Serializing with json.dumps (instead of concatenating strings and swapping
# quote characters in str(dict)) keeps the output valid JSON even when a
# label contains quotes. Without --printlabels the emitted text is the same
# as before; with it, the object is printed on one line.
result = {
    "number_of_trees": number_of_trees,
    # An empty input file has no trees; report 0.0 instead of dividing by 0.
    "avg_tree_size": (sum_of_tree_sizes / number_of_trees
                      if number_of_trees else 0.0),
    "min_tree_size": min_tree_size,
    "max_tree_size": max_tree_size,
    "different_label": len(labels),
}
if args.printlabels:
    result["labels"] = labels
# ensure_ascii=False emits non-ASCII labels as-is rather than as \u escapes.
print(json.dumps(result, ensure_ascii=False))
 No newline at end of file