'''
------------------------------------------------------------------------------
levelUp
    command-line arguments:
        level:     space-sep file of all frequent itemsets of given length k
        trans:     the transaction itemsets
        threshold: the support count requested

------------------------------------------------------------------------------
The input files for level and trans should
* ascii or utf-8 ("txt" file),
* be one itemset per line,
* have items space separated,
* each item is a positive "integer", and
* the items per line should be ordered (e.g., 1 3 5 7 11 ...)

------------------------------------------------------------------------------
       author: parke godfrey
     creation: 2021-01-30
last modified: 2021-02-01
     language: python 3
      license: CC, any reuse and modification, with attribution
------------------------------------------------------------------------------
'''
#=============================================================================
import sys
import time

startTime = time.time()

#=============================================================================
# ItemsetDTree(k):
#   * stores itemsets of length k into a hash / dictionary tree
#   * can handle support-counting by feeding in transaction sets
#       * the leaves of the tree are int's; each branch encodes an itemset

class ItemsetDTree:
    iDTree = {}
    iLen   = None

    #-------------------------------------------------------------------------
    # add:
    #     add an itemset to the dictionary-tree

    def add(self, iset, dtree=None):
        if dtree is None: # this is the root call
            dtree = self.iDTree
            if len(iset) != self.iLen:
                print('Trying to add itemset of length %d ' % len(iset)
                      + 'to Tree with item length %d!' % self.iLen)
                exit(-1)
        if len(iset) == 1:
            if iset[0] not in dtree: # first time seen
                dtree[iset[0]] = 0
        else:
            if iset[0] in dtree: # add rest to existing subtree
                self.add(iset[1:],
                         dtree=dtree[iset[0]])
            else:                # build new subtree
                dtree[iset[0]] = self.add(iset[1:],
                                          dtree={})
        return dtree

    #-------------------------------------------------------------------------
    # writeSupported:
    #     print out the itemsets encoded in a dictionary-tree
    #         reportSupport
    #             True:  first int per line ("itemset") is the support count
    #             False: support count not written

    def writeSupported(self,
                       threshold=0,
                       branch='',
                       reportSupport=False,
                       dtree=None):
        if dtree is None: # this is the root call
            dtree = self.iDTree
        for item in sorted(dtree.keys()): # so they come out in sorted order
            if isinstance(dtree[item], int): # at leaf, so write
                if threshold <= dtree[item]:
                    if reportSupport:
                        print(("%d %s %d"
                                % (dtree[item], branch, item)).strip())
                    else:
                        print(("%s %d" % (branch, item)).strip())
            else:                            # recurse down
                self.writeSupported(threshold=threshold,
                                    branch=(branch + ' ' + str(item)),
                                    reportSupport=reportSupport,
                                    dtree=dtree[item])

    #-------------------------------------------------------------------------
    # supportIncr:
    #     increments the support count of each set in the dictionary tree
    #     that is a subset of the transaction set

    def supportIncr(self, tset, dtree=None):
        if dtree is None: # this is the root call
            dtree = self.iDTree
        if len(tset) == 0:
            return;
        if tset[0] in dtree:
            if isinstance(dtree[tset[0]], int):
                dtree[tset[0]] += 1;
            else:
                self.supportIncr(tset[1:],
                                 dtree=dtree[tset[0]])
        self.supportIncr(tset[1:],
                         dtree=dtree)

    #-------------------------------------------------------------------------
    # INIT

    def __init__(self, length):
        if length <= 0:
            print('Itemset length for tree must be positive!')
            exit(-1)
        self.iLen = length

#=============================================================================
# FUNCTIONS

# itemset / iset: an ordered list of int's representing an itemset
#                 we use int's not strings to make this more efficient

def istrClean(istr):
    istr = ' '.join(istr.strip('\n').split())
    for token in istr.split():
        if not token.isnumeric():
            print('Item "%s" is not an integer!' % token)
            exit(-1)
    return istr

def string2iset(istr):
    return [int(token) for token in istr.split()]

def iset2string(iset):
    itok = [str(item) for item in iset]
    return ' '.join(itok)

#=============================================================================
# MAIN

if __name__ == '__main__':
    k       = 0 # length of itemsets in input level file
    preCNT  = 0 # running tally of pre-candidates produced
    candCNT = 0 # running tally of candidates (after apriori)

    # read first line of level file to measure itemset length
    with open(sys.argv[1], 'r', encoding='utf-8') as inLev:
        for line in inLev:
            line = istrClean(line)
            itemset = string2iset(line.strip('\n'))
            k = len(itemset)
            break
        else:
            print('Empty level file!');
            exit();

    # read in level file with the frequent itemsets of size k
    #   * store a signature (a string) of the itemset's "prefix", all save
    #     the last item, in a hash / dictionary, to enumerate through
    #     later to do the pre-candidate generation via joining
    #   * store a signature (a string) of the itemset for easy
    #     existence checking during apriori checking
    prefixDict  = {}
    itemsetDict = {}
    with open(sys.argv[1], 'r', encoding='utf-8') as inLev:
        for line in inLev:
            line = istrClean(line)
            itemset = string2iset(line.strip('\n'))
            if len(itemset) != k:
                print('Itemset not of length %d read!' % k)
                exit(-1)
            # add the itemset into the prefix signature dictionary
            prefix = iset2string(itemset[:-1])
            if prefix in prefixDict:     # entry already there
                prefixDict[prefix].append(itemset[-1])
            else:                        # create new entry
                prefixDict[prefix] = [itemset[-1]]
            # add the itemset into the itemset signature dictionary
            signature = iset2string(itemset)
            if signature in itemsetDict: # entry seen before
                itemsetDict[signature] += 1
            else:                        # create new entry
                itemsetDict[signature]  = 1

    # if incoming level of itemsets only has one thing, no new candidates!
    if len(itemsetDict) < 2:
        exit()

    # put the final-item list per prefix in sorted order
    for prefix in prefixDict:
        prefixDict[prefix] = sorted(prefixDict[prefix])

    # join on prefixes to create pre-candidates
    # do apriori checks to filter down to candidates
    #   * store the found candidates into a dictionary tree;
    #     their support counts then can be computed efficiently
    #     by iterating over the transactions
    candidates = ItemsetDTree(k+1)
    prefixes   = sorted(prefixDict.keys())
    for prefix in prefixes:
        # join the itemsets with a common prefix
        for i in range(len(prefixDict[prefix]) - 1):
            for j in range(i + 1, len(prefixDict[prefix])):
                itemset = string2iset(prefix)
                itemset.append(prefixDict[prefix][i])
                itemset.append(prefixDict[prefix][j])
                preCNT += 1

                # apriori check the joined pre-candidate!
                apriori = True
                for g in range(len(itemset) - 2):
                    subset = itemset[:g] + itemset[g+1:]
                    if iset2string(subset) not in itemsetDict:
                        apriori = False
                        break
                if apriori:
                    candidates.add(itemset)
                    candCNT += 1

    # read in the transaction itemsets, calculate the support counts
    # for the candidates
    with open(sys.argv[2], 'r', encoding='utf-8') as inTrans:
        for line in inTrans:
            line = istrClean(line)
            itemset = string2iset(line.strip('\n'))
            candidates.supportIncr(itemset)

    # print out the supported items
    candidates.writeSupported(threshold=int(sys.argv[3]),
                              reportSupport=True)

    endTime = time.time()
    print('')
    print('#pre-candidates: %d'   % preCNT)
    print('#candidates:     %d'   % candCNT)
    print('Lapsed time:     %.3f' % (endTime - startTime))