import sys def GetHistogram(word, alphabet): ret = [0] * len(alphabet) for i in range(len(word)): pos = alphabet.find(word[i]) if pos != -1: ret[pos] += 1 return ret class DictTreeNode(dict): def __init__(self, wordlist, alphabet): dict.__init__(self) letteridx = FindMinBranchingLetter(wordlist, alphabet) self.letter = alphabet[letteridx] subdicts = dict() for word in wordlist: hist = GetHistogram(word, alphabet) subdicts.setdefault(hist[letteridx], list()).append(word) if len(alphabet) == 1: for ind in subdicts: self[ind] = subdicts[ind] else: for ind in subdicts: newalphabet = alphabet[0:letteridx] + alphabet[letteridx + 1:] self[ind] = DictTreeNode(subdicts[ind], newalphabet) def ArgMin(l): min = sys.maxint minidx = -1 for i in range(len(l)): if l[i] < min: min = l[i] minidx = i return minidx def FindMinBranchingLetter(wordlist, alphabet): branchlist = [set() for i in range(len(alphabet))] for word in wordlist: hist = GetHistogram(word, alphabet) for i in range(len(hist)): branchlist[i].add(hist[i]) branchcounts = [len(a) for a in branchlist] return ArgMin(branchcounts) class DynamicSubsetTreeBuilder(object): def __init__(self): self.words = list() def AddString(self, s): self.words.append(s.strip().lower()) def AddStrings(self, f): self.words.extend(s.strip().lower() for s in f) def Build(self): tree = DynamicTupleify(DictTreeNode(self.words, DynamicSubsetTree.alphabet)) return DynamicSubsetTree(tree) class DynamicSubsetTree(object): alphabet = "abcdefghijklmnopqrstuvwxyz" def __init__(self, tree): self.tree = tree def Lookup(self, word): hist = GetHistogram(word.strip().lower(), DynamicSubsetTree.alphabet) return self.LookupHistogram(hist) def LookupHistogram(self, hist): nodes = [self.tree] ret = [] while len(nodes) > 0: newnodes = list() for node in nodes: letter = node[0] if letter is None: ret.extend(node[1:]) else: ind = DynamicSubsetTree.alphabet.find(letter) newnodes.extend(node[i] for i in range(1, min(hist[ind] + 2, len(node))) if node[i] is not None) nodes = newnodes return ret def DynamicTupleify(tree): while isinstance(tree, DictTreeNode) and len(tree) == 1 and 0 in tree: tree = tree[0] if isinstance(tree, DictTreeNode): count = max(tree.keys()) + 1 subtree = tuple(DynamicTupleify(tree.get(i, None)) for i in range(count)) return (tree.letter,) + subtree elif isinstance(tree, list): return (None,) + tuple(tree) else: return tree