#!/usr/bin/env python

import sys
import re

# Patterns used by make_generic().  They are raw strings so that regex escapes
# such as \s and \d reach the re module untouched instead of relying on
# Python leaving unknown string escapes alone.

# A numeric literal (optionally negative, optionally with a fraction) or a
# single-quoted string literal ('' is an embedded quote).  Group 1 captures
# what precedes it: the start of the query or a run of
# whitespace/operator/punctuation characters.  The literal must be followed by
# one of those characters, ';' or the end of the query.
LITERAL_RE = re.compile(r"(^|[-\s<>*=()+,]+?)(?:-?\d+(?:\.\d+)?|'(?:''|[^'])*')(?=[-\s<>*=()+,;]|$)")
# DECLARE "<name>" <rest>: groups 1 and 2 keep the text around the quoted name.
DECLARE_RE = re.compile(r'^(DECLARE )"[^"]*"( .*)$')
# FETCH ALL|<n> FROM "<name>": group 1 keeps everything before the quoted name.
FETCH_RE = re.compile(r'^(FETCH (?:ALL|\d+) FROM )"[^"]*"$')
# CLOSE "<name>": group 1 keeps the keyword before the quoted name.
CLOSE_RE = re.compile(r'^(CLOSE )"[^"]*"$')
def make_generic(query):
    """Return *query* with cursor names and literal values replaced by '?'.

    A query starting with DECLARE, FETCH or CLOSE first has its quoted cursor
    name replaced (only the first matching keyword is applied); afterwards
    every literal matched by LITERAL_RE is replaced, for any query.
    """
    # (leading keyword, pattern, replacement) -- checked in this order.
    cursor_rules = (
        ('DECLARE', DECLARE_RE, '\\1?\\2'),
        ('FETCH', FETCH_RE, '\\1?'),
        ('CLOSE', CLOSE_RE, '\\1?'),
    )
    for keyword, pattern, replacement in cursor_rules:
        if query.startswith(keyword):
            query = pattern.sub(replacement, query)
            break
    return LITERAL_RE.sub('\\1?', query)

def parse_data(f, pids=None):
    """Collect per-query duration samples from PostgreSQL log lines.

    Recognised lines start with four whitespace-separated fields, a
    ``postgres[PID]:`` tag and one more field.  Handling per line kind:

    * ``LOG: statement: <sql>`` queues ``<sql>`` for that PID.
    * ``LOG: duration: <n> ms`` pops the oldest queued statement for that PID,
      normalises it with ``make_generic`` and records a sample.  If the PID
      has never issued a statement the line is ignored; if its queue is empty
      a warning is written to stderr and the sample is recorded under the key
      ``None``.
    * Lines matching the skip pattern (DETAIL, WARNING, HINT, ERROR,
      STATEMENT and several LOG messages) are ignored, and so are the
      continuation lines that follow them for the same PID.
    * Any other tagged line is appended, separated by a space, to the most
      recently queued statement of its PID.

    Args:
        f: Iterable yielding log lines (an open file, ``sys.stdin``, a list).
        pids: Optional set.  When given, it is updated with every PID that
            issued at least one statement line.

    Returns:
        dict mapping the generic query text (or ``None``) to a list of
        ``(tag, duration_ms)`` tuples, where ``tag`` is the field captured
        immediately before ``LOG:`` and ``duration_ms`` is a float.

    Raises:
        RuntimeError: if a line matches none of the known formats.
        ValueError: if a duration value cannot be converted to float.
    """
    prefix = r'(?:\S+\s+){4}postgres\[(\d+)\]:\s+\S+\s+'
    statement_re = re.compile('^' + prefix + r'(\S+)\s+LOG:\s+statement:\s+(.*)$')
    statement_cont_re = re.compile(r'^(?:\S+\s+){4}postgres\[(\d+)\]:\s+\S+[ ]?(.*)$')
    duration_re = re.compile('^' + prefix + r'(\S+)\s+LOG:\s+duration:\s+(.*)\s+ms$')
    skip_re = re.compile(
        '^' + prefix + r'(?:\S+\s+)'
        r'(?:DETAIL.*:'
        r'|LOG:\s+(?:QUERY STATISTICS|connection.*|disconnection.*|database.*'
        r'|checkpoint.*|redo.*|next.*|unexpected.*)'
        r'|WARNING:.*|HINT:.*|ERROR:.*|STATEMENT:.*)$')

    query_stack = {}  # pid -> statements still waiting for a duration line
    queries = {}      # generic query (or None) -> [(tag, duration_ms), ...]
    skipping = {}     # pid -> True while its continuation lines are ignored

    for lineno, line in enumerate(f, 1):
        # Text used in diagnostics; only a real trailing newline is removed.
        shown = line.rstrip('\n')

        m = statement_re.match(line)
        if m:
            pid = m.group(1)
            skipping[pid] = False
            query_stack.setdefault(pid, []).append(m.group(3))
            continue

        m = duration_re.match(line)
        if m:
            pid = m.group(1)
            skipping[pid] = False
            pending = query_stack.get(pid)
            if pending is None:
                # This PID never issued a statement: nothing to attribute.
                continue
            if pending:
                statement = pending.pop(0)
                try:
                    generic = make_generic(statement)
                except Exception:
                    # Dump context for debugging, then let the error propagate.
                    sys.stderr.write("%s\n" % shown)
                    sys.stderr.write("%s\n" % pid)
                    sys.stderr.write("%s\n" % pending)
                    raise
            else:
                # The statement line must have been lost somehow.
                sys.stderr.write("WARN: Missing statement for %s\n" % shown)
                generic = None
            sample = (m.group(2), float(m.group(3)))
            queries.setdefault(generic, []).append(sample)
            continue

        m = skip_re.match(line)
        if m:
            skipping[m.group(1)] = True
            continue

        m = statement_cont_re.match(line)
        if m:
            pid = m.group(1)
            if skipping.get(pid):
                continue
            pending = query_stack.get(pid)
            if pending:
                pending[-1] += ' ' + m.group(2).strip()
            elif pending is not None:
                sys.stderr.write(
                    "WARNING: skipping continuation of missing statement: %s\n"
                    % shown)
            continue

        sys.stderr.write("LINE %d: '%s'\n" % (lineno, line.strip()))
        raise RuntimeError('bad line in input')

    if pids is not None:
        pids.update(query_stack)
    return queries


def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if denominator is 0."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def _print_ranking(heading, ranked, queries, totals, tot_queries, tot_time):
    """Print one report section: a blank line, the heading, one row per query.

    Each row shows: sample count, average ms, share of all queries (%),
    share of total time (%), and the generic query text.
    """
    print("")
    print(heading)
    for k in ranked:
        count = len(queries[k])
        print("%d: %0.2fms: %0.2f%%: %0.2f%%: %s" % (
            count,
            _ratio(totals[k], count),
            _ratio(count * 100.0, tot_queries),
            _ratio(totals[k] * 100.0, tot_time),
            k))


def main():
    """Read a PostgreSQL log from stdin and print query timing statistics.

    Prints overall totals followed by three rankings (by frequency, by average
    duration and by total time).  Each ranking is sorted ascending, so the
    highest-ranked query is printed last.
    """
    top_n = 25  # rows per section; the headings are built from the same value

    pids = set()
    queries = parse_data(sys.stdin, pids)

    # Total duration in ms per generic query (samples are (tag, ms) tuples).
    totals = dict((k, sum(ms for _tag, ms in samples))
                  for k, samples in queries.items())
    tot_time = sum(totals.values())
    tot_queries = sum(len(samples) for samples in queries.values())

    print("TOTAL QUERIES: %d generics from %d total" % (len(queries), tot_queries))
    print("TOTAL DB PIDS: %d" % (len(pids),))
    print("TOTAL DB TIME: %0.2fm" % (tot_time / 60000.0,))
    # No wall-clock span is parsed from the log, so the rate is expressed per
    # second of summed query time (durations are in milliseconds).
    print("TPS : %0.2f / s" % (_ratio(tot_queries * 1000.0, tot_time),))

    rankings = (
        ("TOP %d QUERIES BY FREQUENCY:" % top_n,
         lambda k: len(queries[k])),
        ("TOP %d QUERIES BY AVG DURATION:" % top_n,
         lambda k: _ratio(totals[k], len(queries[k]))),
        ("TOP %d QUERIES BY TOTAL TIME" % top_n,
         lambda k: totals[k]),
    )
    for heading, rank in rankings:
        ranked = sorted(queries, key=rank)[-top_n:]
        _print_ranking(heading, ranked, queries, totals, tot_queries, tot_time)

    
# Produce the report only when this file is executed as a script; importing
# it as a module just defines the functions.
if __name__ == "__main__":
    main()
