#!/usr/bin/python
#
# CDA tool: correlates the bulk_extractor histogram files found in two
# bulk_extractor output directories. For each histogram type it reports every
# feature present in both directories, scored by the product of its two counts
# (highest score first). Features listed in an optional stoplist are ignored.

import os,os.path,sys,re,glob
# Extend the module search path: first the lib directory under $DOMEX_HOME
# (only when that variable is set and non-empty), then the relative ../lib/
# directory, which is always appended.
if os.environ.get("DOMEX_HOME"):
    sys.path.append("%s/src/lib/" % os.environ["DOMEX_HOME"])
sys.path.append("../lib/")


# Names of the bulk_extractor output files that are correlated. The main
# script walks this list in order and looks each name up in both directories.
corr_files = [
    'ccn_histogram.txt',
    'email_histogram.txt',
    'telephone_histogram.txt',
    'url_histogram.txt',
    'url_searches.txt',
]


def read_histogram(fn):
    """Read a histogram file and return a dict mapping feature -> count.

    Only lines containing ``n=<digits><TAB><text>`` contribute; every other
    line is ignored. The key is the text after the tab up to the end of the
    line. If a key occurs more than once, the last count read wins.

    fn -- path of the histogram file to read.

    Returns a dict of {feature string: integer count}.
    Raises IOError/OSError if the file cannot be opened.
    """
    # Raw string so that \d reaches the regex engine as written.
    pattern = re.compile(r"n=(\d+)\t(.*)")
    hist = {}
    # The with-block closes the file as soon as reading finishes instead of
    # leaving the handle to the garbage collector.
    with open(fn) as f:
        for line in f:
            m = pattern.search(line)
            if m:
                hist[m.group(2)] = int(m.group(1))
    return hist
        
def read_stoplist(fn):
    r"""Read a stoplist file and return its entries as a set.

    A line with at least two tab characters is treated as a feature-file or
    context-stoplist line: the text between the first pair of tabs that
    enclose a non-empty field is the entry, and the rest of the line (the
    context) is currently ignored. Any other line contributes the whole
    line with surrounding whitespace stripped.

    fn -- path of the stoplist file to read.

    Returns a set of strings.
    Raises IOError/OSError if the file cannot be opened.
    """
    # The docstring must be the first statement to be attached to the
    # function; the pattern is therefore compiled after it.
    field = re.compile(r"\t([^\t]+)\t")
    ret = set()
    # The with-block guarantees the file is closed once it has been read.
    with open(fn) as f:
        for line in f:
            m = field.search(line)
            if m:
                ret.add(m.group(1))
            else:
                ret.add(line.strip())
    return ret
    


def cda2(dir1,dir2,fn,stoplist):
    """Correlate the features of histogram file fn in dir1 with those in dir2.

    dir1, dir2 -- directories that are each expected to contain fn.
    fn         -- histogram file name to look up in both directories.
    stoplist   -- container of feature strings to leave out of the result.

    Returns None if fn is missing from either directory. Otherwise returns
    a list of (score, feature) tuples, one for each feature present in both
    histograms and not in stoplist, where score is the product of the two
    counts. The list is sorted in descending order.
    """
    fn1 = os.path.join(dir1, fn)
    fn2 = os.path.join(dir2, fn)
    if not (os.path.exists(fn1) and os.path.exists(fn2)):
        return None

    # Read both histograms
    h1 = read_histogram(fn1)
    h2 = read_histogram(fn2)

    # Keep only features seen in both histograms. Iterating over the dict's
    # keys avoids the Python-2-only iteritems() call.
    ret = [(h1[k] * h2[k], k)
           for k in h1
           if k in h2 and k not in stoplist]

    # Highest score first; equal scores are ordered by feature, descending.
    ret.sort(reverse=True)
    return ret

if __name__ == "__main__":
    # Command-line entry point: validate the two bulk_extractor output
    # directories, load the optional stoplist, then write one report section
    # for each histogram file that yields at least one correlation.
    from optparse import OptionParser

    parser = OptionParser()
    # NOTE(review): --debug is accepted but never read in the code shown here.
    parser.add_option("-d","--debug",action="store_true")
    parser.add_option("--output",help="Specify output file for report")
    parser.add_option("--stoplist",help="Specify a context stop list")
    parser.usage = "usage: %prog [options] <dir1> <dir2> "
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        sys.exit(1)

    # Verify the directories
    for d in args:
        if not os.path.isdir(d):
            print("%s is not a directory" % d)
            sys.exit(1)
        if not os.path.exists(os.path.join(d,"report.txt")):
            print("%s is not a bulk_extractor output directory" % d)
            sys.exit(1)
        # Any *.00.txt file in the directory is treated as the sign of an
        # unfinished bulk_extractor run.
        if glob.glob(os.path.join(d,"*.00.txt")):
            print("bulk_extractor did not complete in  %s" % d)
            sys.exit(1)

    # Never overwrite an existing report.
    if options.output and os.path.exists(options.output):
        print("%s exists. Delete it first or specify another file." % options.output)
        sys.exit(1)

    (dir1,dir2) = args

    stoplist = set()
    if options.stoplist:
        stoplist = read_stoplist(options.stoplist)

    # Open the report only after all validation and stoplist loading, so a
    # failure above does not leave an empty output file behind.
    out = sys.stdout
    if options.output:
        out = open(options.output,"w")

    try:
        # Okay; let's do it!
        for fn in corr_files:
            ret = cda2(dir1,dir2,fn,stoplist)
            if not ret:
                # Missing file (None) or no correlating features (empty list).
                continue
            out.write("\n\n")
            out.write("=== %s ===\n" % fn)
            for (score,feature) in ret:
                out.write("%d\t%s\n" % (score,feature))
            out.write("\n\n")
    finally:
        # Close the report file if one was opened; leave sys.stdout alone.
        if out is not sys.stdout:
            out.close()
    
        
