diff --git a/CHANGELOG.md b/CHANGELOG.md index a17af40d078b63a73445bbb1c79c6b0df8b9ee9d..bed2a03a36eefc1979c3ed020badd921cb069b28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,14 @@ This changelog follows the specifications detailed in: [Keep a Changelog](https: This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html), although we have not yet reached a `1.0.0` release. -## Version 0.5.11 - Unreleased +## Version 0.5.12 - Unreleased + +### Fixed + +* Included the `_nx_ext_v2` files in setup.py + + +## Version 0.5.11 - Released 2020-12-02 ### Added diff --git a/netharn/__init__.py b/netharn/__init__.py index a93725062f4c7802d1ef75670fed8ab7339fcd1e..d1503f9a946ef008699a897156a0affaec638e33 100644 --- a/netharn/__init__.py +++ b/netharn/__init__.py @@ -4,7 +4,7 @@ mkinit netharn --noattrs --dry mkinit netharn --noattrs """ -__version__ = '0.5.11' +__version__ = '0.5.12' try: # PIL 7.0.0 removed PIL_VERSION, which breaks torchvision, monkey patch it diff --git a/netharn/initializers/_nx_ext/__init__.py b/netharn/initializers/_nx_ext/__init__.py deleted file mode 100644 index c0e9e1bf22e4d816d96b0361ab2c62f29eb4f89f..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/__init__.py +++ /dev/null @@ -1,76 +0,0 @@ -""" -TEMPORARY FORK --------------- - -CommandLine: - sedr networkx.algorithms.isomorphism._embedding netharn.initializers._nx_ext - sedr netharn.initializers._nx_ext netharn.initializers._nx_ext * True - - -Subpackages for helpers and such related to the ordered subtree embedding / -isomorphism problems. - -Contains routines for solving balanced sequence and path subproblems. Only the -final graph-based API is exposed, but modification to the internals (is / will -be) available via keyword arguments. - -balanced_sequence.py - core python implementations for the longest common -balanced sequence subproblem. - -balanced_sequence_cython.pyx - -faster alternative implementsions for balanced_sequence.py - -tree_embedding.py - defines reduction from tree problem to balanced sequence -problems. - -path_embedding.py - defines reduction from path problem to tree problem (not -core, this is just useful for testing among other things). - -demodata.py - Contains data for docstrings, benchmarks, and synthetic problems - - -Outstanding Issues ------------------- -- [ ] Multiple implementations of the algorithm backend / data structure - reduction, need to reduce the impelmentation and / or determine a better - mechansim for allowing the user to switch between them. - -- [ ] strhack is not a good API in `tree_to_seq` - -- [ ] Should we return which edges were contracted in each tree to create the - embeddings? That seems useful (but maybe not equivalent to the embeddings - themselves?) - -- [ ] How to deal with cython + networkx? Do we need to fix that skbuild with - pypy? - -- [ ] The open_to_node problem: - Note, we may be able to simply use the position of each opening token - as a proxy for unique tokens. Pass in an ordered list of nodes, then - just use their indexes. 
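[Editor's sketch] The index-as-token proposal in the last bullet can be made concrete with a small illustration. This is not code that existed in the package: the closing-token scheme and names below are hypothetical, and only the shape of `open_to_close` / `open_to_node` follows the definitions used in these modules.

    # Hypothetical sketch: give each node an integer opening token equal to its
    # position in an ordered node list, pair it with a distinct (negative)
    # closing token, and make open_to_node a plain index lookup instead of an
    # IdentityDict.
    nodes = ['u', 'v', 'w']                                # ordered list of original tree nodes
    open_tokens = list(range(len(nodes)))                  # 0, 1, 2 act as opening tokens
    open_to_close = {i: -(i + 1) for i in open_tokens}     # hypothetical closing-token scheme
    open_to_node = {i: nodes[i] for i in open_tokens}      # token index -> original node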
- - -CommandLine ------------ -xdoctest -m netharn.initializers._nx_ext list -xdoctest -m netharn.initializers._nx_ext all - -# Run all tests in this module -DPATH=$(python -c " -import os; import netharn.initializers._nx_ext as m; -print(os.path.dirname(m.__file__))") -pytest --xdoctest $DPATH --xdoc-analysis=dynamic - -# The mkinit tool helps autogenerate explicit `__init__.py` files -mkinit ~/code/networkx/netharn.initializers._nx_ext/__init__.py -w -""" - -__submodules__ = [ - 'tree_embedding', -] - -# from netharn.initializers._nx_ext import tree_embedding -from netharn.initializers._nx_ext.tree_embedding import ( - maximum_common_ordered_tree_embedding) - -__all__ = ['maximum_common_ordered_tree_embedding'] diff --git a/netharn/initializers/_nx_ext/_bseq_expt.py b/netharn/initializers/_nx_ext/_bseq_expt.py deleted file mode 100644 index 5fad7de9cd800f7b8cf3baa9eb6731f0f334e2e5..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/_bseq_expt.py +++ /dev/null @@ -1,392 +0,0 @@ -from netharn.initializers._nx_ext.balanced_sequence import UnbalancedException, IdentityDict # NOQA -from netharn.initializers._nx_ext.balanced_sequence import generate_all_decomp, _cython_lcs_backend, _lcs_iter_simple_alt2, _lcs_iter_prehash2, _lcs_recurse, _lcs_iter_simple, _lcs_iter_simple_alt1, _lcs_iter_prehash # NOQA - - -def _lcs_iter_simple_alt3(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node): - """ - Depth first stack trajectory and replace try except statements with ifs - - This is the current best pure-python algorithm candidate - - >>> full_seq1 = '{({})([[]([]){(()(({()[]({}{})}))){}}])}' - >>> full_seq2 = '{[({{}}{{[][{}]}(()[(({()})){[]()}])})]}' - >>> open_to_close = {'{': '}', '(': ')', '[': ']'} - >>> full_seq1 = '[][[]][]' - >>> full_seq2 = '[[]][[]]' - >>> open_to_close = {'[': ']'} - >>> import operator as op - >>> node_affinity = op.eq - >>> open_to_node = IdentityDict() - >>> res = _lcs_iter_simple_alt3(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) - >>> embeddings, val, delseq = res - >>> print('embeddings = {!r}'.format(embeddings[0])) - >>> print('delseq = {!r}'.format(delseq[0])) - """ - all_decomp1 = generate_all_decomp(full_seq1, open_to_close, open_to_node) - all_decomp2 = generate_all_decomp(full_seq2, open_to_close, open_to_node) - - key0 = (full_seq1, full_seq2) - frame0 = key0 - stack = [frame0] - - # Memoize mapping (seq1, seq2) -> best size, embeddings, deleted edges - _results = {} - - # Populate base cases - empty1 = type(next(iter(all_decomp1.keys())))() - empty2 = type(next(iter(all_decomp2.keys())))() - best = (empty1, empty2) - base_result = (0, best, ([], [])) - for seq1 in all_decomp1.keys(): - key1 = seq1 - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[key1] - _results[(seq1, empty2)] = base_result - _results[(head1, empty2)] = base_result - _results[(tail1, empty2)] = base_result - _results[(head_tail1, empty2)] = base_result - - for seq2 in all_decomp2.keys(): - key2 = seq2 - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[key2] - _results[(empty1, seq2)] = base_result - _results[(empty1, head2)] = base_result - _results[(empty1, tail2)] = base_result - _results[(empty1, head_tail2)] = base_result - - del frame0 - del empty1 - del empty2 - del best - del base_result - - while stack: - key = stack[-1] - if key not in _results: - seq1, seq2 = key - - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[seq1] - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[seq2] - - # Case 2: The current edge in 
sequence1 is deleted - try_key = (head_tail1, seq2) - if try_key in _results: - cand1 = _results[try_key] - x, y, z = cand1 - z1, z2 = z - z1 = z1 + [a1] - z2 = z2 + [a2] - z3 = (z1, z2) - cand1 = (x, y, z3) - else: - # stack.append(key) - stack.append(try_key) - continue - - # Case 3: The current edge in sequence2 is deleted - try_key = (seq1, head_tail2) - if try_key in _results: - cand2 = _results[try_key] - x, y, z = cand2 - z1, z2 = z - z1 = z1 + [a1] - z2 = z2 + [a2] - z3 = (z1, z2) - cand2 = (x, y, z3) - else: - # stack.append(key) - stack.append(try_key) - continue - - # Case 1: The LCS involves this edge - affinity = node_affinity(t1, t2) - if affinity: - try_key = (head1, head2) - if try_key in _results: - pval_h, new_heads, delseq_h = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - try_key = (tail1, tail2) - if try_key in _results: - pval_t, new_tails, delseq_t = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - new_head1, new_head2 = new_heads - new_tail1, new_tail2 = new_tails - - subseq1 = a1 + new_head1 + b1 + new_tail1 - subseq2 = a2 + new_head2 + b2 + new_tail2 - - res3 = (subseq1, subseq2) - val3 = pval_h + pval_t + affinity - - h1, h2 = delseq_h - t1, t2 = delseq_t - - delseq3 = (h1 + t1, h2 + t2) - cand3 = (val3, res3, delseq3) - else: - cand3 = (-1, None) - - # We solved the frame - _results[key] = max(cand1, cand2, cand3) - stack.pop() - - val, best, delseq = _results[key0] - found = (best, val, delseq) - return found - - -def balanced_decomp2(sequence, open_to_close, start=0): - gen = generate_balance2(sequence, open_to_close) - for tup in gen: - (bal_curr, tok_curr, idx1, idx2) = tup - if idx2 == start: - stop = idx1 - assert bal_curr - break - - return start, stop - # pop_open = sequence[0:1] - # pop_close = sequence[head_stop:head_stop + 1] - # head = sequence[1:head_stop] - # tail = sequence[head_stop + 1:] - # head_tail = head + tail - # return pop_open, pop_close, head, tail, head_tail - - -def generate_balance2(sequence, open_to_close, start=0): - """ - Alternate version that also returns index information - - Yields - ------ - bool, T, int, int - is balanced - opening token - opening token index - current token index - - - Example - ------- - >>> open_to_close = {0: 1} - >>> seq = sequence = [0, 0, 0, 1, 1, 1, 0, 1] - >>> gen = list(generate_balance2(sequence, open_to_close)) - >>> for flag, token, idx1, idx2 in gen: - >>> print('flag={:d}, token={}, {}, {}'.format(flag, token, idx1, idx2)) - - balanced_decomp2(sequence, open_to_close) - """ - stack = [] - # Traversing the Expression - for curr_idx, token in enumerate(sequence, start=start): - - if token in open_to_close: - # Push opening elements onto the stack - stack.append((token, curr_idx)) - open_idx = -1 - else: - # Check that closing elements - if not stack: - raise UnbalancedException - prev_open, open_idx = stack.pop() - want_close = open_to_close[prev_open] - - if token != want_close: - raise UnbalancedException - - # If the stack is empty the sequence is currently balanced - currently_balanced = not bool(stack) - yield currently_balanced, token, curr_idx, open_idx - - if stack: - raise UnbalancedException - - -def generate_all_decomp2(full_seq, open_to_close, open_to_node=None): - """ - Alternate version where we keep track of indices instead - - Example - ------- - >>> full_seq = '0010010010111101' - >>> open_to_close = {'0': '1'} - >>> full_seq = '{[{}]}[()]' - >>> open_to_close = {'[': ']', '{': '}', '(': ')'} - >>> 
list(generate_balance2(full_seq, open_to_close)) - >>> all_decomp = generate_all_decomp2(full_seq, open_to_close) - - >>> from netharn.initializers._nx_ext import demodata - >>> full_seq, open_to_close = demodata.random_balanced_sequence(5, mode='number') - >>> all_decomp = generate_all_decomp2(full_seq, open_to_close) - """ - if open_to_node is None: - open_to_node = IdentityDict() - all_decomp = {} - - start = 0 - stop = len(full_seq) - deleted = [] - stack = [ - ('f', full_seq, start, stop, deleted) - ] - - DEBUG = 1 - - while stack: - t, seq, seq_start, seq_stop, seq_del = stack.pop() - if DEBUG: - import ubelt as ub - print('-----') - print(list(full_seq)) - - isdel = ['X' if b else ' ' for b in ub.boolmask(seq_del, len(full_seq))] - sep = ' : ' - pos = list(' ' * len(full_seq)) - pos[seq_start] = 'S' - pos[seq_stop - 1] = 'T' - prefix = ': ' - def padjoin(s): - return sep.join(['{:>2}'.format(c) for c in s]) - print(prefix + padjoin(range(len(full_seq)))) - print(prefix + padjoin(full_seq) + ' <- full_seq') - print(prefix + padjoin(isdel) + ' <- seq_del') - print(prefix + padjoin(pos) + ' <- seq_start, seq_stop') - - val = seq_start, seq_stop, seq_del - print('seq = {}, {!r}, {}'.format(t, seq, val)) - base = full_seq[seq_start:seq_stop] - print('base = {!r}'.format(base)) - rel_pad_del = [idx - seq_start for idx in seq_del if idx >= seq_start] - keep_idxs = sorted(set(range(len(base))) - set(rel_pad_del)) - newlist = [base[idx] for idx in keep_idxs] - try: - recon = ''.join(newlist) - except TypeError: - recon = tuple(newlist) - print('recon = {!r}'.format(recon)) - if seq: - rel_start, rel_stop = balanced_decomp2(seq, open_to_close) - - rel_head_start = rel_start + 1 - rel_head_stop = rel_stop - rel_tail_start = rel_stop + 1 - rel_tail_stop = len(seq) - if DEBUG > 1: - print('rel_start = {!r}'.format(rel_start)) - print('rel_stop = {!r}'.format(rel_stop)) - print('rel_head_start = {!r}'.format(rel_head_start)) - print('rel_head_stop = {!r}'.format(rel_head_stop)) - print('rel_tail_start = {!r}'.format(rel_tail_start)) - print('rel_tail_stop = {!r}'.format(rel_tail_stop)) - - rel_pad_del = [idx - seq_start for idx in seq_del if seq_start <= idx <= seq_stop] - if DEBUG: - print('rel_pad_del = {!r}'.format(rel_pad_del)) - - # I think there is a cumsum way of doing this, I'm being dense atm - # seq = '3' * 10 - # rel_pad_del = [4, 5, 9, 11] - hack_map = list(range(1 + len(seq) + len(rel_pad_del))) - for idx in sorted(rel_pad_del, reverse=True): - del hack_map[idx] - - if DEBUG: - print('hack_map = {!r}'.format(hack_map)) - - # I believe it is the case that the deleted indexes will only be - # able to cause a shift in the abs_tail_stop, the abs_tail_start, - # abs_head_stop, and abs_head_start should never "conflict" with - # the deleted indexes (I think). 
- - # num_del_after_tail_start = sum(abs_tail_start <= i <= seq_stop for i in seq_del) - # print('num_del_after_tail_start = {!r}'.format(num_del_after_tail_start)) - # num_del_before_tail_start = sum(0 <= i <= rel_tail_stop for i in rel_pad_del) - - abs_head_start = hack_map[rel_head_start] + seq_start - abs_head_stop = hack_map[rel_head_stop] + seq_start - - abs_tail_start = hack_map[rel_tail_start] + seq_start - abs_tail_stop = hack_map[rel_tail_stop] + seq_start - - if DEBUG > 1: - print('abs_head_start = {!r}'.format(abs_head_start)) - print('abs_head_stop = {!r}'.format(abs_head_stop)) - - print('abs_tail_start = {!r}'.format(abs_tail_start)) - print('abs_tail_stop = {!r}'.format(abs_tail_stop)) - - head_sl = slice(rel_head_start, rel_head_stop) - tail_sl = slice(rel_tail_start, rel_tail_stop) - - head = seq[head_sl] - tail = seq[tail_sl] - head_tail = head + tail - - head_del = seq_del - tail_del = seq_del - - if abs_head_stop == abs_head_start: - # case where tail is empty (which head_tail doesnt matter - # anyway but this is just a POC - abs_head_tail_start = abs_tail_start - else: - abs_head_tail_start = abs_head_start - - if abs_tail_stop == abs_tail_start: - # case where tail is empty (which head_tail doesnt matter - # anyway but this is just a POC - abs_head_tail_stop = abs_head_stop - else: - abs_head_tail_stop = abs_tail_stop - - abs_del_start = seq_start + rel_start - abs_del_stop = seq_start + rel_stop - - # head_tail_del = [abs_del_start, abs_del_stop] + seq_del - assert abs_del_start < abs_head_tail_start - if abs_del_stop < abs_head_tail_stop: - head_tail_del = [abs_del_stop] + seq_del - else: - head_tail_del = seq_del - - # seq[head_sl] + seq[tail_sl] - - # pop_open, pop_close, head, tail, head_tail = balanced_decomp2(seq, open_to_close) - # node = open_to_node[pop_open[0]] - all_decomp[seq] = (seq_start, seq_stop, seq_del) - # (node, pop_open, pop_close, head, tail, head_tail) - - if abs_head_stop > len(full_seq): - raise AssertionError - if abs_tail_stop > len(full_seq): - raise AssertionError - if abs_head_tail_stop > len(full_seq): - raise AssertionError - - if head: - if DEBUG: - print('head = {!r}'.format(head)) - head_del = [i for i in head_del if abs_head_start <= i < abs_head_stop] - stack.append(('h', head, abs_head_start, abs_head_stop, head_del)) - if tail: - if DEBUG: - print('tail = {!r}'.format(tail)) - tail_del = [i for i in tail_del if abs_tail_start <= i < abs_tail_stop] - stack.append(('t', tail, abs_tail_start, abs_tail_stop, tail_del)) - if tail and head: - if DEBUG: - print('head_tail = {!r}'.format(head_tail)) - print('head_tail_del = {!r}'.format(head_tail_del)) - head_tail_del = [i for i in head_tail_del if abs_head_tail_start <= i < abs_head_tail_stop] - stack.append(('ht', head_tail, abs_head_tail_start, abs_head_tail_stop, head_tail_del)) - if DEBUG: - assert seq == recon - - return all_decomp diff --git a/netharn/initializers/_nx_ext/balanced_sequence.py b/netharn/initializers/_nx_ext/balanced_sequence.py deleted file mode 100644 index 576bdda33dd86601ee6fd5ebf9593ca83e16ea05..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/balanced_sequence.py +++ /dev/null @@ -1,1146 +0,0 @@ -""" -Balanced sequences are used via reduction to solve the maximum common subtree -embedding problem. 
-""" -import operator - - -def longest_common_balanced_sequence( - seq1, seq2, open_to_close, open_to_node=None, - node_affinity='auto', impl='iter-prehash2'): - """ - Finds the longest common balanced sequence between two sequences - - Parameters - ---------- - seq1, seq2: Iterable - two input balanced sequences - - open_to_close : Dict - a mapping from opening to closing tokens in the balanced sequence - - open_to_node : Dict | None - a dictionary that maps a sequence token to a token corresponding to an - original problem (e.g. a tree node), if unspecified an identity mapping - is assumed. FIXME: see outstanding issues. - WILL LIKELY CHANGE IN THE FUTURE - - node_affinity : None | str | callable - Function for to determine if two nodes can be matched. The return is - interpreted as a weight that is used to break ties. If None then any - node can match any other node and only the topology is important. - The default is "eq", which is the same as ``operator.eq``. - - impl : str - Determines the backend implementation. There are currently 8 different - backend implementations: - - recurse, iter, iter-prehash, iter-prehash2, iter-alt1, iter-alt2, - iter-alt2-cython, and iter-prehash2-cython. - - Example - ------- - >>> # extremely simple case - >>> seq1 = '[][[]][]' - >>> seq2 = '[[]][[]]' - >>> open_to_close = {'[': ']'} - >>> best, value = longest_common_balanced_sequence(seq1, seq2, open_to_close) - >>> subseq1, subseq2 = best - >>> print('subseq1 = {!r}'.format(subseq1)) - subseq1 = '[][[]]' - - >>> # 1-label case from the paper (see Example 5) - >>> # https://pdfs.semanticscholar.org/0b6e/061af02353f7d9b887f9a378be70be64d165.pdf - >>> seq1 = '0010010010111100001011011011' - >>> seq2 = '001000101101110001000100101110111011' - >>> open_to_close = {'0': '1'} - >>> best, value = longest_common_balanced_sequence(seq1, seq2, open_to_close) - >>> subseq1, subseq2 = best - >>> print('subseq1 = {!r}'.format(subseq1)) - >>> assert value == 13 - subseq1 = '00100101011100001011011011' - - >>> # 3-label case - >>> seq1 = '{({})([[]([]){(()(({()[]({}{})}))){}}])}' - >>> seq2 = '{[({{}}{{[][{}]}(()[(({()})){[]()}])})]}' - >>> open_to_close = {'{': '}', '(': ')', '[': ']'} - >>> best, value = longest_common_balanced_sequence(seq1, seq2, open_to_close) - >>> subseq1, subseq2 = best - >>> print('subseq1 = {!r}'.format(subseq1)) - >>> assert value == 10 - subseq1 = '{{}[][]()(({()})){}}' - """ - if node_affinity == 'auto' or node_affinity == 'eq': - node_affinity = operator.eq - if node_affinity is None: - def _matchany(a, b): - return True - node_affinity = _matchany - if open_to_node is None: - open_to_node = IdentityDict() - full_seq1 = seq1 - full_seq2 = seq2 - if impl == 'auto': - if _cython_lcs_backend(): - impl = 'iter-alt2-cython' - else: - impl = 'iter-alt2' - - if impl == 'recurse': - _memo = {} - _seq_memo = {} - best, value = _lcs_recurse( - full_seq1, full_seq2, open_to_close, node_affinity, open_to_node, - _memo, _seq_memo) - elif impl == 'iter': - best, value = _lcs_iter_simple( - full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) - elif impl == 'iter-prehash': - best, value = _lcs_iter_prehash( - full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) - elif impl == 'iter-prehash2': - best, value = _lcs_iter_prehash2( - full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) - elif impl == 'iter-alt1': - best, value = _lcs_iter_simple_alt1( - full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) - elif impl == 'iter-alt2': - best, value = 
_lcs_iter_simple_alt2( - full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) - elif impl == 'iter-alt2-cython': - balanced_sequence_cython = _cython_lcs_backend(error='raise') - best, value = balanced_sequence_cython._lcs_iter_simple_alt2_cython( - full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) - elif impl == 'iter-prehash2-cython': - balanced_sequence_cython = _cython_lcs_backend(error='raise') - best, value = balanced_sequence_cython._lcs_iter_prehash2_cython( - full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) - else: - raise KeyError(impl) - return best, value - - -def available_impls_longest_common_balanced_sequence(): - """ - Returns all available implementations for - :func:`longest_common_balanced_sequence`. - """ - from netharn.initializers._nx_ext import balanced_sequence - impls = [] - if balanced_sequence._cython_lcs_backend(): - impls += [ - 'iter-alt2-cython', - 'iter-prehash2-cython', - ] - - # Pure python backends - impls += [ - 'iter-prehash2', - 'iter-alt2', - 'iter-alt1', - 'iter-prehash', - 'iter', - 'recurse', - ] - return impls - - -def _cython_lcs_backend(error='ignore'): - """ - Returns the cython backend if available, otherwise None - """ - try: - from netharn.initializers._nx_ext import balanced_sequence_cython - except Exception: - if error == 'ignore': - return None - elif error == 'raise': - raise - else: - raise KeyError(error) - else: - return balanced_sequence_cython - - -def _lcs_iter_simple_alt2(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node): - """ - Depth first stack trajectory and replace try except statements with ifs - - This is the current best pure-python algorithm candidate - - >>> full_seq1 = '{({})([[]([]){(()(({()[]({}{})}))){}}])}' - >>> full_seq2 = '{[({{}}{{[][{}]}(()[(({()})){[]()}])})]}' - >>> open_to_close = {'{': '}', '(': ')', '[': ']'} - >>> full_seq1 = '[][[]][]' - >>> full_seq2 = '[[]][[]]' - >>> open_to_close = {'[': ']'} - >>> import operator as op - >>> node_affinity = op.eq - >>> open_to_node = IdentityDict() - >>> res = _lcs_iter_simple_alt2(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) - >>> val, embeddings = res - """ - all_decomp1 = generate_all_decomp(full_seq1, open_to_close, open_to_node) - all_decomp2 = generate_all_decomp(full_seq2, open_to_close, open_to_node) - - key0 = (full_seq1, full_seq2) - frame0 = key0 - stack = [frame0] - - # Memoize mapping (seq1, seq2) -> best size, embeddings, deleted edges - _results = {} - - # Populate base cases - empty1 = type(next(iter(all_decomp1.keys())))() - empty2 = type(next(iter(all_decomp2.keys())))() - best = (empty1, empty2) - base_result = (0, best) - for seq1 in all_decomp1.keys(): - key1 = seq1 - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[key1] - _results[(seq1, empty2)] = base_result - _results[(head1, empty2)] = base_result - _results[(tail1, empty2)] = base_result - _results[(head_tail1, empty2)] = base_result - - for seq2 in all_decomp2.keys(): - key2 = seq2 - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[key2] - _results[(empty1, seq2)] = base_result - _results[(empty1, head2)] = base_result - _results[(empty1, tail2)] = base_result - _results[(empty1, head_tail2)] = base_result - - del frame0 - del empty1 - del empty2 - del best - del base_result - - while stack: - key = stack[-1] - if key not in _results: - seq1, seq2 = key - - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[seq1] - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[seq2] - - # Case 2: The current edge 
in sequence1 is deleted - try_key = (head_tail1, seq2) - if try_key in _results: - cand1 = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - # Case 3: The current edge in sequence2 is deleted - try_key = (seq1, head_tail2) - if try_key in _results: - cand2 = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - # Case 1: The LCS involves this edge - affinity = node_affinity(t1, t2) - if affinity: - try_key = (head1, head2) - if try_key in _results: - pval_h, new_heads = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - try_key = (tail1, tail2) - if try_key in _results: - pval_t, new_tails = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - new_head1, new_head2 = new_heads - new_tail1, new_tail2 = new_tails - - subseq1 = a1 + new_head1 + b1 + new_tail1 - subseq2 = a2 + new_head2 + b2 + new_tail2 - - res3 = (subseq1, subseq2) - val3 = pval_h + pval_t + affinity - cand3 = (val3, res3) - else: - cand3 = (-1, None) - - # We solved the frame - _results[key] = max(cand1, cand2, cand3) - stack.pop() - - val, best = _results[key0] - found = (best, val) - return found - - -def _lcs_iter_prehash2(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node): - """ - Version of the lcs iterative algorithm where we precompute hash values - - See :func:`longest_common_balanced_sequence` for parameter details. - """ - - all_decomp1 = generate_all_decomp_prehash(full_seq1, open_to_close, open_to_node) - all_decomp2 = generate_all_decomp_prehash(full_seq2, open_to_close, open_to_node) - - key_decomp1 = {} - key_decomp2 = {} - _results = {} - # Populate base cases - empty1 = type(next(iter(all_decomp1.keys())))() - empty2 = type(next(iter(all_decomp2.keys())))() - empty1_key = hash(empty1) - empty2_key = hash(empty2) - best = (empty1, empty2) - base_result = (0, best) - for seq1, info1 in all_decomp1.items(): - seq1_key = hash(seq1) - head1_key, tail1_key, head_tail1_key = all_decomp1[seq1][5:8] - _results[(seq1_key, empty2_key)] = base_result - _results[(head1_key, empty2_key)] = base_result - _results[(tail1_key, empty2_key)] = base_result - _results[(head_tail1_key, empty2_key)] = base_result - key_decomp1[seq1_key] = info1 - - for seq2, info2 in all_decomp2.items(): - seq2_key = hash(seq2) - head2_key, tail2_key, head_tail2_key = all_decomp2[seq2][5:8] - _results[(empty1_key, seq2_key)] = base_result - _results[(empty1_key, head2_key)] = base_result - _results[(empty1_key, tail2_key)] = base_result - _results[(empty1_key, head_tail2_key)] = base_result - key_decomp2[seq2_key] = info2 - - full_seq1_key = hash(full_seq1) - full_seq2_key = hash(full_seq2) - key0 = (full_seq1_key, full_seq2_key) - frame0 = key0, full_seq1, full_seq2 - stack = [frame0] - missing_frames = [] - while stack: - frame = stack[-1] - key, seq1, seq2 = frame - seq1_key, seq2_key = key - if key not in _results: - missing_frames.clear() - - info1 = key_decomp1[seq1_key] - tok1, seq1, head1, tail1, head_tail1, head1_key, tail1_key, head_tail1_key, a1, b1 = info1 - - # if seq2_key not in key_decomp2: - info2 = key_decomp2[seq2_key] - tok2, seq2, head2, tail2, head_tail2, head2_key, tail2_key, head_tail2_key, a2, b2 = info2 - - affinity = node_affinity(tok1, tok2) - - # Case 2: The current edge in sequence1 is deleted - try_key = (head_tail1_key, seq2_key) - if try_key in _results: - cand1 = _results[try_key] - else: - miss_frame = try_key, head_tail1, seq2 - stack.append(miss_frame) - 
continue - - # Case 3: The current edge in sequence2 is deleted - try_key = (seq1_key, head_tail2_key) - if try_key in _results: - cand2 = _results[try_key] - else: - miss_frame = try_key, seq1, head_tail2 - stack.append(miss_frame) - continue - - # Case 1: The LCS involves this edge - if affinity: - try_key = (head1_key, head2_key) - if try_key in _results: - pval_h, new_heads = _results[try_key] - else: - miss_frame = try_key, head1, head2 - stack.append(miss_frame) - continue - - try_key = (tail1_key, tail2_key) - if try_key in _results: - pval_t, new_tails = _results[try_key] - else: - miss_frame = try_key, tail1, tail2 - stack.append(miss_frame) - continue - - new_head1, new_head2 = new_heads - new_tail1, new_tail2 = new_tails - - subseq1 = a1 + new_head1 + b1 + new_tail1 - subseq2 = a2 + new_head2 + b2 + new_tail2 - - res3 = (subseq1, subseq2) - val3 = pval_h + pval_t + affinity - cand3 = (val3, res3) - else: - cand3 = (-1, None) - - # We solved the frame - _results[key] = max(cand1, cand2, cand3) - stack.pop() - - # The stack pop is our solution - (val, best) = _results[key0] - found = (best, val) - return found - - -def _lcs_recurse(seq1, seq2, open_to_close, node_affinity, open_to_node, _memo, _seq_memo): - """ - Surprisingly, this recursive implementation is one of the faster - pure-python methods for certain input types. However, its major drawback is - that it can raise a RecurssionError if the inputs are too deep. - """ - if not seq1: - return (seq1, seq1), 0 - elif not seq2: - return (seq2, seq2), 0 - else: - key1 = hash(seq1) # using hash(seq) is faster than seq itself - key2 = hash(seq2) - key = hash((key1, key2)) - if key in _memo: - return _memo[key] - - if key1 in _seq_memo: - a1, b1, head1, tail1, head1_tail1 = _seq_memo[key1] - else: - a1, b1, head1, tail1, head1_tail1 = balanced_decomp_unsafe(seq1, open_to_close) - _seq_memo[key1] = a1, b1, head1, tail1, head1_tail1 - - if key2 in _seq_memo: - a2, b2, head2, tail2, head2_tail2 = _seq_memo[key2] - else: - a2, b2, head2, tail2, head2_tail2 = balanced_decomp_unsafe(seq2, open_to_close) - _seq_memo[key2] = a2, b2, head2, tail2, head2_tail2 - - # Case 2: The current edge in sequence1 is deleted - best, val = _lcs_recurse(head1_tail1, seq2, open_to_close, node_affinity, open_to_node, _memo, _seq_memo) - - # Case 3: The current edge in sequence2 is deleted - cand, val_alt = _lcs_recurse(seq1, head2_tail2, open_to_close, node_affinity, open_to_node, _memo, _seq_memo) - if val_alt > val: - best = cand - val = val_alt - - # Case 1: The LCS involves this edge - t1 = open_to_node[a1[0]] - t2 = open_to_node[a2[0]] - affinity = node_affinity(t1, t2) - if affinity: - new_heads, pval_h = _lcs_recurse(head1, head2, open_to_close, node_affinity, open_to_node, _memo, _seq_memo) - new_tails, pval_t = _lcs_recurse(tail1, tail2, open_to_close, node_affinity, open_to_node, _memo, _seq_memo) - - new_head1, new_head2 = new_heads - new_tail1, new_tail2 = new_tails - - subseq1 = a1 + new_head1 + b1 + new_tail1 - subseq2 = a2 + new_head2 + b2 + new_tail2 - - cand = (subseq1, subseq2) - val_alt = pval_h + pval_t + affinity - if val_alt > val: - best = cand - val = val_alt - - found = (best, val) - _memo[key] = found - return found - - -def _lcs_iter_simple(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node): - """ - Converts _lcs_recursive to an iterative algorithm using a fairly - straightforward method that effectivly simulates callstacks. 
- Uses a breadth-first trajectory and try-except to catch missing - memoized results (which seems to be slightly slower than if statements). - """ - all_decomp1 = generate_all_decomp(full_seq1, open_to_close, open_to_node) - all_decomp2 = generate_all_decomp(full_seq2, open_to_close, open_to_node) - - args0 = (full_seq1, full_seq2) - frame0 = args0 - stack = [frame0] - - _results = {} - # Populate base cases - empty1 = type(next(iter(all_decomp1.keys())))() - empty2 = type(next(iter(all_decomp2.keys())))() - best = (empty1, empty2) - base_result = (0, best) - for seq1 in all_decomp1.keys(): - key1 = seq1 - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[key1] - _results[(seq1, empty2)] = base_result - _results[(head1, empty2)] = base_result - _results[(tail1, empty2)] = base_result - _results[(head_tail1, empty2)] = base_result - - for seq2 in all_decomp2.keys(): - key2 = seq2 - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[key2] - _results[(empty1, seq2)] = base_result - _results[(empty1, head2)] = base_result - _results[(empty1, tail2)] = base_result - _results[(empty1, head_tail2)] = base_result - - del args0 - del frame0 - del empty1 - del empty2 - del best - del base_result - - missing_frames = [] - while stack: - key = stack.pop() - if key not in _results: - seq1, seq2 = key - missing_frames.clear() - - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[seq1] - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[seq2] - - # Case 2: The current edge in sequence1 is deleted - try: - try_key = (head_tail1, seq2) - cand1 = _results[try_key] - except KeyError: - missing_frames.append(try_key) - - # Case 3: The current edge in sequence2 is deleted - try: - try_key = (seq1, head_tail2) - cand2 = _results[try_key] - except KeyError: - missing_frames.append(try_key) - - # Case 1: The LCS involves this edge - affinity = node_affinity(t1, t2) - if affinity: - try: - try_key = (head1, head2) - pval_h, new_heads = _results[try_key] - except KeyError: - missing_frames.append(try_key) - - try: - try_key = (tail1, tail2) - pval_t, new_tails = _results[try_key] - except KeyError: - missing_frames.append(try_key) - - if not missing_frames: - new_head1, new_head2 = new_heads - new_tail1, new_tail2 = new_tails - - subseq1 = a1 + new_head1 + b1 + new_tail1 - subseq2 = a2 + new_head2 + b2 + new_tail2 - - res3 = (subseq1, subseq2) - val3 = pval_h + pval_t + affinity - cand3 = (val3, res3) - else: - cand3 = (-1, None) - - if missing_frames: - # We did not solve this frame yet - stack.append(key) - stack.extend(missing_frames) - # stack.extend(missing_frames[::-1]) - else: - # We solved the frame - _results[key] = max(cand1, cand2, cand3) - - val, best = _results[key] - found = (best, val) - return found - - -def _lcs_iter_simple_alt1(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node): - """ - Depth first stack trajectory - """ - all_decomp1 = generate_all_decomp(full_seq1, open_to_close, open_to_node) - all_decomp2 = generate_all_decomp(full_seq2, open_to_close, open_to_node) - - args0 = (full_seq1, full_seq2) - frame0 = args0 - stack = [frame0] - - _results = {} - # Populate base cases - empty1 = type(next(iter(all_decomp1.keys())))() - empty2 = type(next(iter(all_decomp2.keys())))() - best = (empty1, empty2) - base_result = (0, best) - for seq1 in all_decomp1.keys(): - key1 = seq1 - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[key1] - _results[(seq1, empty2)] = base_result - _results[(head1, empty2)] = base_result - _results[(tail1, empty2)] = base_result - 
_results[(head_tail1, empty2)] = base_result - - for seq2 in all_decomp2.keys(): - key2 = seq2 - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[key2] - _results[(empty1, seq2)] = base_result - _results[(empty1, head2)] = base_result - _results[(empty1, tail2)] = base_result - _results[(empty1, head_tail2)] = base_result - - del args0 - del frame0 - del empty1 - del empty2 - del best - del base_result - - while stack: - key = stack.pop() - if key not in _results: - seq1, seq2 = key - - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[seq1] - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[seq2] - - # Case 2: The current edge in sequence1 is deleted - try: - try_key = (head_tail1, seq2) - cand1 = _results[try_key] - except KeyError: - stack.append(key) - stack.append(try_key) - continue - - # Case 3: The current edge in sequence2 is deleted - try: - try_key = (seq1, head_tail2) - cand2 = _results[try_key] - except KeyError: - stack.append(key) - stack.append(try_key) - continue - - # Case 1: The LCS involves this edge - affinity = node_affinity(t1, t2) - if affinity: - try: - try_key = (head1, head2) - pval_h, new_heads = _results[try_key] - except KeyError: - stack.append(key) - stack.append(try_key) - continue - - try: - try_key = (tail1, tail2) - pval_t, new_tails = _results[try_key] - except KeyError: - stack.append(key) - stack.append(try_key) - continue - - new_head1, new_head2 = new_heads - new_tail1, new_tail2 = new_tails - - subseq1 = a1 + new_head1 + b1 + new_tail1 - subseq2 = a2 + new_head2 + b2 + new_tail2 - - res3 = (subseq1, subseq2) - val3 = pval_h + pval_t + affinity - cand3 = (val3, res3) - else: - cand3 = (-1, None) - - # We solved the frame - _results[key] = max(cand1, cand2, cand3) - - val, best = _results[key] - found = (best, val) - return found - - -def _lcs_iter_prehash(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node): - """ - Version of the lcs iterative algorithm where we precompute hash values. - Uses a breadth-first trajectory. 
- """ - all_decomp1 = generate_all_decomp_prehash(full_seq1, open_to_close, open_to_node) - all_decomp2 = generate_all_decomp_prehash(full_seq2, open_to_close, open_to_node) - - key_decomp1 = {} - key_decomp2 = {} - _results = {} - # Populate base cases - empty1 = type(next(iter(all_decomp1.keys())))() - empty2 = type(next(iter(all_decomp2.keys())))() - empty1_key = hash(empty1) - empty2_key = hash(empty2) - best = (empty1, empty2) - base_result = (0, best) - for seq1, info1 in all_decomp1.items(): - seq1_key = hash(seq1) - head1_key, tail1_key, head_tail1_key = all_decomp1[seq1][5:8] - _results[(seq1_key, empty2_key)] = base_result - _results[(head1_key, empty2_key)] = base_result - _results[(tail1_key, empty2_key)] = base_result - _results[(head_tail1_key, empty2_key)] = base_result - key_decomp1[seq1_key] = info1 - - for seq2, info2 in all_decomp2.items(): - seq2_key = hash(seq2) - head2_key, tail2_key, head_tail2_key = all_decomp2[seq2][5:8] - _results[(empty1_key, seq2_key)] = base_result - _results[(empty1_key, head2_key)] = base_result - _results[(empty1_key, tail2_key)] = base_result - _results[(empty1_key, head_tail2_key)] = base_result - key_decomp2[seq2_key] = info2 - - full_seq1_key = hash(full_seq1) - full_seq2_key = hash(full_seq2) - key0 = (full_seq1_key, full_seq2_key) - frame0 = key0, full_seq1, full_seq2 - stack = [frame0] - missing_frames = [] - while stack: - frame = stack.pop() - key, seq1, seq2 = frame - seq1_key, seq2_key = key - if key not in _results: - missing_frames.clear() - - try: - info1 = key_decomp1[seq1_key] - except KeyError: - info1 = balanced_decomp_prehash(seq1, open_to_close) - key_decomp1[seq1_key] = info1 - tok1, seq1, head1, tail1, head_tail1, head1_key, tail1_key, head_tail1_key, a1, b1 = info1 - - try: - info2 = key_decomp2[seq2_key] - except KeyError: - info2 = balanced_decomp_prehash(seq2, open_to_close) - key_decomp2[seq2_key] = info2 - tok2, seq2, head2, tail2, head_tail2, head2_key, tail2_key, head_tail2_key, a2, b2 = info2 - - affinity = node_affinity(tok1, tok2) - - # Case 2: The current edge in sequence1 is deleted - try: - try_key = (head_tail1_key, seq2_key) - cand1 = _results[try_key] - except KeyError: - miss_frame = try_key, head_tail1, seq2 - missing_frames.append(miss_frame) - - # Case 3: The current edge in sequence2 is deleted - try: - try_key = (seq1_key, head_tail2_key) - cand2 = _results[try_key] - except KeyError: - miss_frame = try_key, seq1, head_tail2 - missing_frames.append(miss_frame) - - # Case 1: The LCS involves this edge - if affinity: - try: - try_key = (head1_key, head2_key) - pval_h, new_heads = _results[try_key] - except KeyError: - miss_frame = try_key, head1, head2 - missing_frames.append(miss_frame) - - try: - try_key = (tail1_key, tail2_key) - pval_t, new_tails = _results[try_key] - except KeyError: - miss_frame = try_key, tail1, tail2 - missing_frames.append(miss_frame) - - if not missing_frames: - new_head1, new_head2 = new_heads - new_tail1, new_tail2 = new_tails - - subseq1 = a1 + new_head1 + b1 + new_tail1 - subseq2 = a2 + new_head2 + b2 + new_tail2 - - res3 = (subseq1, subseq2) - val3 = pval_h + pval_t + affinity - cand3 = (val3, res3) - else: - cand3 = (-1, None) - - if missing_frames: - # We did not solve this frame yet - stack.append(frame) - stack.extend(missing_frames[::-1]) - else: - # We solved the frame - _results[key] = max(cand1, cand2, cand3) - - # The stack pop is our solution - (val, best) = _results[key] - found = (best, val) - return found - - -class UnbalancedException(Exception): - """ - 
Denotes that a sequence was unbalanced - """ - pass - - -class IdentityDict: - """ - Used when ``open_to_node`` is unspecified - """ - def __getitem__(self, key): - return key - - -def generate_all_decomp(seq, open_to_close, open_to_node=None): - """ - Generates all decompositions of a single balanced sequence by - recursive decomposition of the head, tail, and head|tail. - - Parameters - ---------- - seq : Tuple | str - a tuple of hashable items or a string where each character is an item - - open_to_close : Dict - a dictionary that maps opening tokens to closing tokens in the balanced - sequence problem. - - open_to_node : Dict - a dictionary that maps a sequence token to a token corresponding to an - original problem (e.g. a tree node) - - Returns - ------- - Dict : mapping from a sub-sequence to its decomposition - - Notes - ----- - In the paper: See Definition 2, 4, Lemma, 1, 2, 3, 4. - - Example - ------- - >>> # Example 2 in the paper (one from each column) - >>> seq = '00100100101111' - >>> open_to_close = {'0': '1'} - >>> all_decomp = generate_all_decomp(seq, open_to_close) - >>> assert len(all_decomp) == len(seq) // 2 - >>> import pprint - >>> pprint.pprint(all_decomp) - {'00100100101111': ('0', '0', '1', '010010010111', '', '010010010111'), - '0010010111': ('0', '0', '1', '01001011', '', '01001011'), - '001011': ('0', '0', '1', '0101', '', '0101'), - '01': ('0', '0', '1', '', '', ''), - '010010010111': ('0', '0', '1', '', '0010010111', '0010010111'), - '01001011': ('0', '0', '1', '', '001011', '001011'), - '0101': ('0', '0', '1', '', '01', '01')} - - Example - ------- - >>> open_to_close = {'{': '}', '(': ')', '[': ']'} - >>> seq = '({[[]]})[[][]]{{}}' - >>> all_decomp = generate_all_decomp(seq, open_to_close) - >>> node, *decomp = all_decomp[seq] - >>> pop_open, pop_close, head, tail, head_tail = decomp - >>> print('node = {!r}'.format(node)) - >>> print('pop_open = {!r}'.format(pop_open)) - >>> print('pop_close = {!r}'.format(pop_close)) - >>> print('head = {!r}'.format(head)) - >>> print('tail = {!r}'.format(tail)) - >>> print('head_tail = {!r}'.format(head_tail)) - node = '(' - pop_open = '(' - pop_close = ')' - head = '{[[]]}' - tail = '[[][]]{{}}' - head_tail = '{[[]]}[[][]]{{}}' - >>> decomp_alt = balanced_decomp(seq, open_to_close) - >>> assert decomp_alt == tuple(decomp) - - Example - ------- - >>> from netharn.initializers._nx_ext.demodata import random_balanced_sequence - >>> seq, open_to_close = random_balanced_sequence(10) - >>> all_decomp = generate_all_decomp(seq, open_to_close) - """ - if open_to_node is None: - open_to_node = IdentityDict() - all_decomp = {} - stack = [seq] - while stack: - seq = stack.pop() - if seq not in all_decomp and seq: - pop_open, pop_close, head, tail, head_tail = balanced_decomp(seq, open_to_close) - node = open_to_node[pop_open[0]] - all_decomp[seq] = (node, pop_open, pop_close, head, tail, head_tail) - if head: - if tail: - stack.append(head_tail) - stack.append(tail) - stack.append(head) - elif tail: - stack.append(tail) - return all_decomp - - -def balanced_decomp(sequence, open_to_close): - """ - Generates a decomposition of a balanced sequence. - - Parameters - ---------- - sequence : str - balanced sequence to be decomposed - - open_to_close: dict - a dictionary that maps opening tokens to closing tokens in the balanced - sequence problem. - - Returns - ------- - : tuple[T, T, T, T, T] - where ``T = type(sequence)`` - Contents of this tuple are: - - 0. a1 - a sequence of len(1) containing the current opening token - 1. 
b1 - a sequence of len(1) containing the current closing token - 2. head - head of the sequence - 3. tail - tail of the sequence - 4. head_tail - the concatanted head and tail - - Example - ------- - >>> # Example 3 from the paper - >>> sequence = '001000101101110001000100101110111011' - >>> open_to_close = {'0': '1'} - >>> a1, b1, head, tail, head_tail = balanced_decomp(sequence, open_to_close) - >>> print('head = {!r}'.format(head)) - >>> print('tail = {!r}'.format(tail)) - head = '010001011011' - tail = '0001000100101110111011' - - Example - ------- - >>> open_to_close = {0: 1} - >>> sequence = [0, 0, 0, 1, 1, 1, 0, 1] - >>> a1, b1, head, tail, head_tail = balanced_decomp(sequence, open_to_close) - >>> print('a1 = {!r}'.format(a1)) - >>> print('b1 = {!r}'.format(b1)) - >>> print('head = {!r}'.format(head)) - >>> print('tail = {!r}'.format(tail)) - >>> print('head_tail = {!r}'.format(head_tail)) - a1 = [0] - b1 = [1] - head = [0, 0, 1, 1] - tail = [0, 1] - head_tail = [0, 0, 1, 1, 0, 1] - >>> a2, b2, tail1, tail2, head_tail2 = balanced_decomp(tail, open_to_close) - - Example - ------- - >>> open_to_close = {'{': '}', '(': ')', '[': ']'} - >>> sequence = '({[[]]})[[][]]' - >>> a1, b1, head, tail, head_tail = balanced_decomp(sequence, open_to_close) - >>> print('a1 = {!r}'.format(a1)) - >>> print('b1 = {!r}'.format(b1)) - >>> print('head = {!r}'.format(head)) - >>> print('tail = {!r}'.format(tail)) - >>> print('head_tail = {!r}'.format(head_tail)) - a1 = '(' - b1 = ')' - head = '{[[]]}' - tail = '[[][]]' - head_tail = '{[[]]}[[][]]' - >>> a2, b2, tail1, tail2, head_tail2 = balanced_decomp(tail, open_to_close) - >>> print('a2 = {!r}'.format(a2)) - >>> print('b2 = {!r}'.format(b2)) - >>> print('tail1 = {!r}'.format(tail1)) - >>> print('tail2 = {!r}'.format(tail2)) - >>> print('head_tail2 = {!r}'.format(head_tail2)) - a2 = '[' - b2 = ']' - tail1 = '[][]' - tail2 = '' - head_tail2 = '[][]' - """ - gen = generate_balance(sequence, open_to_close) - - bal_curr, tok_curr = next(gen) - pop_open = sequence[0:1] - want_close = open_to_close[tok_curr] - - head_stop = 1 - for head_stop, (bal_curr, tok_curr) in enumerate(gen, start=1): - if tok_curr is None: - break - elif bal_curr and tok_curr == want_close: - pop_close = sequence[head_stop:head_stop + 1] - break - head = sequence[1:head_stop] - tail = sequence[head_stop + 1:] - head_tail = head + tail - return pop_open, pop_close, head, tail, head_tail - - -def generate_balance(sequence, open_to_close): - """ - Iterates through a balanced sequence and reports if the sequence-so-far - is balanced at that position or not. 
- - Parameters - ---------- - sequence: List[Tuple] | str: - an input balanced sequence - - open_to_close : Dict - a mapping from opening to closing tokens in the balanced sequence - - Raises - ------ - UnbalancedException - if the input sequence is not balanced - - Yields - ------ - Tuple[bool, T]: - boolean indicating if the sequence is balanced at this index, - and the current token - - Example - ------- - >>> open_to_close = {0: 1} - >>> sequence = [0, 0, 0, 1, 1, 1] - >>> gen = list(generate_balance(sequence, open_to_close)) - >>> for flag, token in gen: - >>> print('flag={:d}, token={}'.format(flag, token)) - - Example - ------- - >>> from netharn.initializers._nx_ext.demodata import random_balanced_sequence - >>> sequence, open_to_close = random_balanced_sequence(4) - >>> print('sequence = {!r}'.format(sequence)) - >>> gen = list(generate_balance(sequence, open_to_close)) - >>> for flag, token in gen: - >>> print('flag={:d}, token={}'.format(flag, token)) - """ - stack = [] - # Traversing the Expression - for token in sequence: - - if token in open_to_close: - # Push opening elements onto the stack - stack.append(token) - else: - # Check that closing elements - if not stack: - raise UnbalancedException - prev_open = stack.pop() - want_close = open_to_close[prev_open] - - if token != want_close: - raise UnbalancedException - - # If the stack is empty the sequence is currently balanced - currently_balanced = not bool(stack) - yield currently_balanced, token - - if stack: - raise UnbalancedException - - -def generate_all_decomp_prehash(seq, open_to_close, open_to_node): - """ - Like :func:`generate_all_decomp` but additionally returns the - precomputed hashes of the sequences. - """ - all_decomp = {} - stack = [seq] - while stack: - seq = stack.pop() - if seq: - # key = hash(seq) - key = seq - if key not in all_decomp: - info = balanced_decomp_prehash(seq, open_to_close, open_to_node) - head, tail, head_tail = info[2:5] - all_decomp[key] = info - stack.append(head_tail) - stack.append(head) - stack.append(tail) - return all_decomp - - -def balanced_decomp_prehash(seq, open_to_close, open_to_node): - """ - Like :func:`balanced_decomp` but additionally returns the - precomputed hashes of the sequences. - """ - pop_open, pop_close, head, tail, head_tail = balanced_decomp_unsafe(seq, open_to_close) - head_key = hash(head) - tail_key = hash(tail) - head_tail_key = hash(head_tail) - node = open_to_node[pop_open[0]] - a = pop_open - b = pop_close - info = (node, seq, head, tail, head_tail, head_key, tail_key, head_tail_key, a, b) - return info - - -def balanced_decomp_unsafe(sequence, open_to_close): - """ - Same as :func:`balanced_decomp` but assumes that ``sequence`` is valid - balanced sequence in order to execute faster. - """ - gen = generate_balance_unsafe(sequence, open_to_close) - - bal_curr, tok_curr = next(gen) - pop_open = sequence[0:1] - want_close = open_to_close[tok_curr] - - head_stop = 1 - for head_stop, (bal_curr, tok_curr) in enumerate(gen, start=1): - if bal_curr and tok_curr == want_close: - pop_close = sequence[head_stop:head_stop + 1] - break - head = sequence[1:head_stop] - tail = sequence[head_stop + 1:] - head_tail = head + tail - return pop_open, pop_close, head, tail, head_tail - - -def generate_balance_unsafe(sequence, open_to_close): - """ - Same as :func:`generate_balance` but assumes that ``sequence`` is valid - balanced sequence in order to execute faster. 
- """ - stacklen = 0 - for token in sequence: - if token in open_to_close: - stacklen += 1 - else: - stacklen -= 1 - yield stacklen == 0, token diff --git a/netharn/initializers/_nx_ext/balanced_sequence_cython.pyx b/netharn/initializers/_nx_ext/balanced_sequence_cython.pyx deleted file mode 100644 index b9f0df5279b345aeb0fab4b00671363ff99d7e93..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/balanced_sequence_cython.pyx +++ /dev/null @@ -1,344 +0,0 @@ -# distutils: language = c++ -""" -This module re-implements functions in :module:`balanced_sequence` in cython -and obtains 40-50x speedups in common circumstances. There are likely more -speed improvements that could be made. - -CommandLine ------------ -# Explicitly build this cython module (must in networkx repo root) -cythonize -a -i networkx/algorithms/isomorphism/_embedding/balanced_sequence_cython.pyx - - -Examples --------- ->>> from networkx.algorithms.isomorphism._embedding.balanced_sequence_cython import _lcs_iter_prehash2_cython ->>> from networkx.algorithms.isomorphism._embedding.balanced_sequence_cython import _lcs_iter_simple_alt2_cython ->>> from networkx.algorithms.isomorphism._embedding.demodata import random_balanced_sequence ->>> seq1, open_to_close1 = random_balanced_sequence(300, mode='paren') ->>> seq2, open_to_close2 = random_balanced_sequence(300, mode='paren') ->>> open_to_close = {**open_to_close1, **open_to_close2} ->>> full_seq1 = seq1 ->>> full_seq2 = seq2 ->>> import operator ->>> node_affinity = operator.eq ->>> open_to_node = IdentityDict() ->>> best1, value1 = _lcs_iter_prehash2_cython(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) ->>> best2, value2 = _lcs_iter_simple_alt2_cython(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node) ->>> assert value1 == value1 -""" - - -def _lcs_iter_prehash2_cython(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node): - """ - Version of the lcs iterative algorithm where we precompute hash values. - - This is the current fastest implementation candidate for the LCS problem, - but note that the alternative version is faster in some cases. 
- """ - cdef dict all_decomp1 = generate_all_decomp_prehash_cython(full_seq1, open_to_close, open_to_node) - cdef dict all_decomp2 = generate_all_decomp_prehash_cython(full_seq2, open_to_close, open_to_node) - cdef dict key_decomp1 = {} - cdef dict key_decomp2 = {} - - cdef dict _results = {} - # Populate base cases - empty1 = type(next(iter(all_decomp1.keys())))() - empty2 = type(next(iter(all_decomp2.keys())))() - cdef Py_hash_t empty1_key = hash(empty1) - cdef Py_hash_t empty2_key = hash(empty2) - cdef tuple best = (empty1, empty2) - - cdef tuple info1, info2 - cdef tuple try_key, key - cdef Py_hash_t seq1_key, seq2_key - cdef Py_hash_t head1_key, tail1_key, head_tail1_key - cdef Py_hash_t head2_key, tail2_key, head_tail2_key - cdef tuple frame - cdef tuple miss_frame - - base_result = (0, best) - for seq1, info1 in all_decomp1.items(): - seq1_key = hash(seq1) - head1_key, tail1_key, head_tail1_key = all_decomp1[seq1][5:8] - _results[(seq1_key, empty2_key)] = base_result - _results[(head1_key, empty2_key)] = base_result - _results[(tail1_key, empty2_key)] = base_result - _results[(head_tail1_key, empty2_key)] = base_result - key_decomp1[seq1_key] = info1 - - for seq2, info2 in all_decomp2.items(): - seq2_key = hash(seq2) - head2_key, tail2_key, head_tail2_key = all_decomp2[seq2][5:8] - _results[(empty1_key, seq2_key)] = base_result - _results[(empty1_key, head2_key)] = base_result - _results[(empty1_key, tail2_key)] = base_result - _results[(empty1_key, head_tail2_key)] = base_result - key_decomp2[seq2_key] = info2 - - cdef Py_hash_t full_seq1_key = hash(full_seq1) - cdef Py_hash_t full_seq2_key = hash(full_seq2) - - cdef tuple key0 = (full_seq1_key, full_seq2_key) - cdef tuple frame0 = (key0, full_seq1, full_seq2) - cdef list stack = [frame0] - - while stack: - frame = stack[-1] - key, seq1, seq2 = frame - seq1_key, seq2_key = key - if key not in _results: - info1 = key_decomp1[seq1_key] - tok1, seq1, head1, tail1, head_tail1, head1_key, tail1_key, head_tail1_key, a1, b1 = info1 - - info2 = key_decomp2[seq2_key] - tok2, seq2, head2, tail2, head_tail2, head2_key, tail2_key, head_tail2_key, a2, b2 = info2 - - affinity = node_affinity(tok1, tok2) - - # Case 2: The current edge in sequence1 is deleted - try_key = (head_tail1_key, seq2_key) - if try_key in _results: - cand1 = _results[try_key] - else: - miss_frame = try_key, head_tail1, seq2 - stack.append(miss_frame) - continue - - # Case 3: The current edge in sequence2 is deleted - try_key = (seq1_key, head_tail2_key) - if try_key in _results: - cand2 = _results[try_key] - else: - miss_frame = try_key, seq1, head_tail2 - stack.append(miss_frame) - continue - - # Case 1: The LCS involves this edge - if affinity: - try_key = (head1_key, head2_key) - if try_key in _results: - pval_h, new_heads = _results[try_key] - else: - miss_frame = try_key, head1, head2 - stack.append(miss_frame) - continue - - try_key = (tail1_key, tail2_key) - if try_key in _results: - pval_t, new_tails = _results[try_key] - else: - miss_frame = try_key, tail1, tail2 - stack.append(miss_frame) - continue - - new_head1, new_head2 = new_heads - new_tail1, new_tail2 = new_tails - - subseq1 = a1 + new_head1 + b1 + new_tail1 - subseq2 = a2 + new_head2 + b2 + new_tail2 - - res3 = (subseq1, subseq2) - val3 = pval_h + pval_t + affinity - cand3 = (val3, res3) - else: - cand3 = (-1, None) - - # We solved the frame - _results[key] = max(cand1, cand2, cand3) - stack.pop() - - # The stack pop is our solution - (val, best) = _results[key0] - found = (best, val) - return found - - - 
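[Editor's sketch] Callers were not meant to invoke these cython functions directly; they are reached through the `impl` argument of `longest_common_balanced_sequence` in `balanced_sequence.py` (shown earlier in this diff). A minimal usage sketch under the pre-removal module layout, reusing the doctest inputs from that function and assuming only the behavior documented above (cython implementations are listed first by `available_impls_longest_common_balanced_sequence` when the compiled extension imports cleanly):

    from netharn.initializers._nx_ext import balanced_sequence

    seq1, seq2 = '[][[]][]', '[[]][[]]'   # inputs from the doctest above
    open_to_close = {'[': ']'}

    # Prefer the cython backend when it is built; the first listed impl is a
    # cython one if the extension imports, otherwise a pure-python fallback.
    impls = balanced_sequence.available_impls_longest_common_balanced_sequence()
    best, value = balanced_sequence.longest_common_balanced_sequence(
        seq1, seq2, open_to_close, impl=impls[0])
    # impl='auto' makes a similar choice internally.

    subseq1, subseq2 = best
    print(subseq1, value)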
- -def _lcs_iter_simple_alt2_cython(full_seq1, full_seq2, open_to_close, node_affinity, open_to_node): - """ - Depth first stack trajectory and replace try except statements with ifs - """ - if open_to_node is None: - open_to_node = IdentityDict() - all_decomp1 = generate_all_decomp_cython(full_seq1, open_to_close, open_to_node) - all_decomp2 = generate_all_decomp_cython(full_seq2, open_to_close, open_to_node) - - key0 = (full_seq1, full_seq2) - frame0 = key0 - stack = [frame0] - - _results = {} - # Populate base cases - empty1 = type(next(iter(all_decomp1.keys())))() - empty2 = type(next(iter(all_decomp2.keys())))() - best = (empty1, empty2) - base_result = (0, best) - for seq1 in all_decomp1.keys(): - key1 = seq1 - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[key1] - _results[(seq1, empty2)] = base_result - _results[(head1, empty2)] = base_result - _results[(tail1, empty2)] = base_result - _results[(head_tail1, empty2)] = base_result - - for seq2 in all_decomp2.keys(): - key2 = seq2 - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[key2] - _results[(empty1, seq2)] = base_result - _results[(empty1, head2)] = base_result - _results[(empty1, tail2)] = base_result - _results[(empty1, head_tail2)] = base_result - - while stack: - key = stack[-1] - if key not in _results: - seq1, seq2 = key - - t1, a1, b1, head1, tail1, head_tail1 = all_decomp1[seq1] - t2, a2, b2, head2, tail2, head_tail2 = all_decomp2[seq2] - - # Case 2: The current edge in sequence1 is deleted - try_key = (head_tail1, seq2) - if try_key in _results: - cand1 = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - # Case 3: The current edge in sequence2 is deleted - try_key = (seq1, head_tail2) - if try_key in _results: - cand2 = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - # Case 1: The LCS involves this edge - affinity = node_affinity(t1, t2) - if affinity: - try_key = (head1, head2) - if try_key in _results: - pval_h, new_heads = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - try_key = (tail1, tail2) - if try_key in _results: - pval_t, new_tails = _results[try_key] - else: - # stack.append(key) - stack.append(try_key) - continue - - new_head1, new_head2 = new_heads - new_tail1, new_tail2 = new_tails - - subseq1 = a1 + new_head1 + b1 + new_tail1 - subseq2 = a2 + new_head2 + b2 + new_tail2 - - res3 = (subseq1, subseq2) - val3 = pval_h + pval_t + affinity - cand3 = (val3, res3) - else: - cand3 = (-1, None) - - # We solved the frame - _results[key] = max(cand1, cand2, cand3) - stack.pop() - - val, best = _results[key0] - found = (best, val) - return found - - -cdef tuple balanced_decomp_unsafe_cython(sequence, dict open_to_close): - """ - Cython version of :func:`balanced_decomp_unsafe`. - """ - cdef int stacklen = 1 # always +1 in the first iteration - cdef int head_stop = 1 - - tok_curr = sequence[0] - want_close = open_to_close[tok_curr] - - # for tok_curr in sequence[1:]: - for head_stop in range(1, len(sequence)): - tok_curr = sequence[head_stop] - stacklen += 1 if tok_curr in open_to_close else -1 - if stacklen == 0 and tok_curr == want_close: - pop_close = sequence[head_stop:head_stop + 1] - break - - pop_open = sequence[0:1] - head = sequence[1:head_stop] - tail = sequence[head_stop + 1:] - head_tail = head + tail - return pop_open, pop_close, head, tail, head_tail - - -cdef generate_all_decomp_cython(seq, open_to_close, open_to_node=None): - """ - Cython version of :func:`generate_all_decomp`. 
- """ - all_decomp = {} - stack = [seq] - while stack: - seq = stack.pop() - if seq not in all_decomp and seq: - pop_open, pop_close, head, tail, head_tail = balanced_decomp_unsafe_cython(seq, open_to_close) - node = open_to_node[pop_open[0]] - all_decomp[seq] = (node, pop_open, pop_close, head, tail, head_tail) - stack.append(head_tail) - stack.append(head) - stack.append(tail) - return all_decomp - - -cdef tuple balanced_decomp_prehash_cython(seq, dict open_to_close, open_to_node): - """ - Cython version of :func:`balanced_decomp_unsafe`. - """ - cdef tuple info - pop_open, pop_close, head, tail, head_tail = balanced_decomp_unsafe_cython(seq, open_to_close) - cdef Py_hash_t head_key = hash(head) - cdef Py_hash_t tail_key = hash(tail) - cdef Py_hash_t head_tail_key = hash(head_tail) - node = open_to_node[pop_open[0]] - a = pop_open - b = pop_close - info = (node, seq, head, tail, head_tail, head_key, tail_key, head_tail_key, a, b) - return info - - -cdef dict generate_all_decomp_prehash_cython(seq, dict open_to_close, open_to_node): - """ - Cython version of :func:`generate_all_decomp_prehash`. - """ - cdef dict all_decomp = {} - cdef list stack = [seq] - cdef tuple info - while stack: - seq = stack.pop() - if seq: - # key = hash(seq) - key = seq - if key not in all_decomp: - info = balanced_decomp_prehash_cython(seq, open_to_close, open_to_node) - head, tail, head_tail = info[2:5] - all_decomp[key] = info - stack.append(head_tail) - stack.append(head) - stack.append(tail) - return all_decomp - - -class IdentityDict: - """ Used when ``open_to_node`` is unspecified """ - def __getitem__(self, key): - return key diff --git a/netharn/initializers/_nx_ext/benchmarks.py b/netharn/initializers/_nx_ext/benchmarks.py deleted file mode 100644 index 2a9e70a93366f63cc91f8ec2a8b082f8c9a230e4..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/benchmarks.py +++ /dev/null @@ -1,387 +0,0 @@ -from netharn.initializers._nx_ext.path_embedding import ( # NOQA - maximum_common_path_embedding) -# from netharn.initializers._nx_ext.tree_embedding import ( # NOQA -# maximum_common_ordered_tree_embedding, tree_to_seq) -from netharn.initializers._nx_ext.demodata import random_paths -from netharn.initializers._nx_ext.demodata import random_ordered_tree # NOQA -import operator - - -def bench_maximum_common_path_embedding(): - """ - xdoctest -m netharn.initializers._nx_ext.benchmarks bench_maximum_common_path_embedding - """ - import itertools as it - import ubelt as ub - import timerit - from netharn.initializers._nx_ext import balanced_sequence - from netharn.initializers._nx_ext import path_embedding - - data_modes = [] - - # Define which implementations we are going to test - run_basis = { - 'mode': [ - 'chr', - # 'number' - # 'tuple', # by far the slowest - ], - 'impl': balanced_sequence.available_impls_longest_common_balanced_sequence(), - } - - # Define the properties of the random data we are going to test on - data_basis = { - 'size': [20, 50], - 'max_depth': [8, 16], - 'common': [8, 16], - 'prefix_depth1': [0, 4], - 'prefix_depth2': [0, 4], - # 'labels': [26 ** 1, 26 ** 8] - 'labels': [1, 26] - } - - # run_basis['impl'] = set(run_basis['impl']) & { - # 'iter-alt2-cython', - # 'iter-prehash2-cython', - # 'iter-prehash2', - # 'iter-alt2', - # # 'iter-alt1', - # # 'iter-prehash', - # # 'iter', - # # 'recurse' - # } - - # TODO: parametarize demo names - # BENCH_MODE = None - # BENCH_MODE = 'small' - # BENCH_MODE = 'small2' - # BENCH_MODE = 'recursion-error' - BENCH_MODE = 'medium' - # 
BENCH_MODE = 'large' - - if BENCH_MODE == 'small': - data_basis = { - 'size': [30], - 'max_depth': [8, 2], - 'common': [2, 8], - 'prefix_depth1': [0, 4], - 'prefix_depth2': [0], - 'labels': [4] - } - run_basis['impl'] = set(run_basis['impl']) & { - # 'iter-alt2-cython', - 'iter-prehash2-cython', - 'iter-prehash2', - # 'iter-alt2', - # 'iter', - # 'recurse', - } - run_basis['impl'] = ub.oset(balanced_sequence.available_impls_longest_common_balanced_sequence()) - { - 'recurse', - } - # runparam_to_time = { - # ('chr', 'iter-prehash2-cython'): {'mean': 0.062, 'max': 0.157}, - # ('chr', 'iter-prehash2') : {'mean': 0.071, 'max': 0.185}, - # } - - if BENCH_MODE == 'small2': - data_basis = { - 'size': [30], - 'max_depth': [8, 2], - 'common': [2, 8], - 'prefix_depth1': [0, 4], - 'prefix_depth2': [0], - 'labels': [4] - } - run_basis['impl'] = ub.oset(balanced_sequence.available_impls_longest_common_balanced_sequence()) - { - 'recurse', - } - run_basis['mode'] = ['number', 'chr'] - # runparam_to_time = { - # ('chr', 'iter-alt2-cython') : {'mean': 0.036, 'max': 0.094}, - # ('chr', 'iter-alt2') : {'mean': 0.049, 'max': 0.125}, - # ('chr', 'iter-alt1') : {'mean': 0.050, 'max': 0.129}, - # ('chr', 'iter-prehash2-cython') : {'mean': 0.057, 'max': 0.146}, - # ('number', 'iter-prehash2-cython'): {'mean': 0.057, 'max': 0.146}, - # ('chr', 'iter') : {'mean': 0.064, 'max': 0.167}, - # ('chr', 'iter-prehash2') : {'mean': 0.066, 'max': 0.170}, - # ('number', 'iter-prehash2') : {'mean': 0.067, 'max': 0.176}, - # ('chr', 'iter-prehash') : {'mean': 0.073, 'max': 0.187}, - # ('number', 'iter-prehash') : {'mean': 0.074, 'max': 0.196}, - # ('number', 'iter-alt1') : {'mean': 0.126, 'max': 0.344}, - # ('number', 'iter-alt2-cython') : {'mean': 0.133, 'max': 0.363}, - # ('number', 'iter') : {'mean': 0.140, 'max': 0.386}, - # ('number', 'iter-alt2') : {'mean': 0.149, 'max': 0.408}, - # } - - if BENCH_MODE == 'medium': - data_basis = { - 'size': [30, 40], - 'max_depth': [4, 8], - 'common': [8, 50], - 'prefix_depth1': [0, 4], - 'prefix_depth2': [2], - 'labels': [8, 1] - } - # Results - # runparam_to_time = { - # ('chr', 'iter-alt2-cython') : {'mean': 0.112, 'max': 0.467}, - # ('chr', 'recurse') : {'mean': 0.153, 'max': 0.648}, - # ('chr', 'iter-alt2') : {'mean': 0.155, 'max': 0.661}, - # ('chr', 'iter-alt1') : {'mean': 0.163, 'max': 0.707}, - # ('chr', 'iter-prehash2-cython'): {'mean': 0.197, 'max': 0.849}, - # ('chr', 'iter') : {'mean': 0.216, 'max': 0.933}, - # ('chr', 'iter-prehash2') : {'mean': 0.225, 'max': 0.974}, - # ('chr', 'iter-prehash') : {'mean': 0.253, 'max': 1.097}, - # } - - if BENCH_MODE == 'large': - data_basis = { - 'size': [30, 40], - 'max_depth': [4, 12], # 64000 - 'common': [8, 32], - 'prefix_depth1': [0, 4], - 'prefix_depth2': [2], - 'labels': [8] - } - run_basis['impl'] = balanced_sequence.available_impls_longest_common_balanced_sequence() - # runparam_to_time = { - # ('chr', 'iter-alt2-cython') : {'mean': 0.282, 'max': 0.923}, - # ('chr', 'recurse') : {'mean': 0.397, 'max': 1.297}, - # ('chr', 'iter-alt2') : {'mean': 0.409, 'max': 1.328}, - # ('chr', 'iter-alt1') : {'mean': 0.438, 'max': 1.428}, - # ('chr', 'iter-prehash2-cython'): {'mean': 0.511, 'max': 1.668}, - # ('chr', 'iter') : {'mean': 0.580, 'max': 1.915}, - # ('chr', 'iter-prehash2') : {'mean': 0.605, 'max': 1.962}, - # ('chr', 'iter-prehash') : {'mean': 0.679, 'max': 2.211}, - # } - - elif BENCH_MODE == 'too-big': - data_basis = { - 'size': [100], - 'max_depth': [8], - 'common': [80], - 'prefix_depth1': [4], - 'prefix_depth2': [2], - 
'labels': [8] - } - if BENCH_MODE == 'recursion-error': - data_basis = { - 'size': [0], - 'max_depth': [512], - 'common': [4], - 'prefix_depth1': [0], - 'prefix_depth2': [0], - 'labels': [256] - } - run_basis['impl'] = ub.oset(['recurse']) | ub.oset(balanced_sequence.available_impls_longest_common_balanced_sequence()) - # Results - # complexity = 69.48 - # stats1 = {'depth': 395, 'n_edges': 1203, 'n_leafs': 4, 'n_nodes': 1207, 'npaths': 4} - # stats2 = {'depth': 395, 'n_edges': 1203, 'n_leafs': 4, 'n_nodes': 1207, 'npaths': 4} - # runparam_to_time = { - # ('chr', 'recurse') : {'mean': NAN, 'max': NAN}, - # ('chr', 'iter-alt2-cython') : {'mean': 7.979, 'max': 7.979}, - # ('chr', 'iter-alt2') : {'mean': 11.307, 'max': 11.307}, - # ('chr', 'iter-alt1') : {'mean': 11.659, 'max': 11.659}, - # ('chr', 'iter-prehash2-cython'): {'mean': 15.230, 'max': 15.230}, - # ('chr', 'iter-prehash2') : {'mean': 17.058, 'max': 17.058}, - # ('chr', 'iter') : {'mean': 18.377, 'max': 18.377}, - # ('chr', 'iter-prehash') : {'mean': 19.508, 'max': 19.508}, - # } - - data_modes = [ - dict(zip(data_basis.keys(), vals)) - for vals in it.product(*data_basis.values())] - run_modes = [ - dict(zip(run_basis.keys(), vals)) - for vals in it.product(*run_basis.values())] - - print('len(data_modes) = {!r}'.format(len(data_modes))) - print('len(run_modes) = {!r}'.format(len(run_modes))) - print('total = {}'.format(len(data_modes) * len(run_modes))) - - seed = 0 - # if len(data_modes) < 10: - # for datakw in data_modes: - # _datakw = ub.dict_diff(datakw, {'complexity'}) - # paths1, paths2 = random_paths(seed=seed, **datakw) - # print('paths1 = {}'.format(ub.repr2(paths1, nl=1))) - # print('paths2 = {}'.format(ub.repr2(paths2, nl=1))) - # print('---') - for idx, datakw in enumerate(data_modes): - print('datakw = {}'.format(ub.repr2(datakw, nl=1))) - _datakw = ub.dict_diff(datakw, {'complexity'}) - paths1, paths2 = random_paths(seed=seed, **_datakw) - tree1 = path_embedding.paths_to_otree(paths1) - tree2 = path_embedding.paths_to_otree(paths2) - stats1 = { - 'npaths': len(paths1), - 'n_nodes': len(tree1.nodes), - 'n_edges': len(tree1.edges), - 'n_leafs': len([n for n in tree1.nodes if len(tree1.succ[n]) == 0]), - 'depth': max(len(p.split('/')) for p in paths1), - } - stats2 = { - 'npaths': len(paths2), - 'n_nodes': len(tree2.nodes), - 'n_edges': len(tree2.edges), - 'n_leafs': len([n for n in tree2.nodes if len(tree2.succ[n]) == 0]), - 'depth': max(len(p.split('/')) for p in paths2), - } - complexity = ( - stats1['n_nodes'] * min(stats1['n_leafs'], stats1['depth']) * - stats2['n_nodes'] * min(stats2['n_leafs'], stats2['depth'])) ** .25 - - datakw['complexity'] = complexity - print('datakw = {}'.format(ub.repr2(datakw, nl=0, precision=2))) - - if True: - # idx + 4 > len(data_modes): - print('stats1 = {}'.format(ub.repr2(stats1, nl=0))) - print('stats2 = {}'.format(ub.repr2(stats2, nl=0))) - # print('complexity = {:.2f}'.format(complexity)) - - total = len(data_modes) * len(run_modes) - print('len(data_modes) = {!r}'.format(len(data_modes))) - print('len(run_modes) = {!r}'.format(len(run_modes))) - print('total = {!r}'.format(total)) - seed = 0 - - prog = ub.ProgIter(total=total, verbose=3) - prog.begin() - results = [] - ti = timerit.Timerit(1, bestof=1, verbose=1, unit='s') - for datakw in data_modes: - _datakw = ub.dict_diff(datakw, {'complexity'}) - paths1, paths2 = random_paths(seed=seed, **_datakw) - print('---') - prog.step(4) - tree1 = path_embedding.paths_to_otree(paths1) - tree2 = path_embedding.paths_to_otree(paths2) - 
stats1 = { - 'npaths': len(paths1), - 'n_nodes': len(tree1.nodes), - 'n_edges': len(tree1.edges), - 'n_leafs': len([n for n in tree1.nodes if len(tree1.succ[n]) == 0]), - 'depth': max(len(p.split('/')) for p in paths1), - } - stats2 = { - 'npaths': len(paths2), - 'n_nodes': len(tree2.nodes), - 'n_edges': len(tree2.edges), - 'n_leafs': len([n for n in tree2.nodes if len(tree2.succ[n]) == 0]), - 'depth': max(len(p.split('/')) for p in paths2), - } - complexity = ( - stats1['n_nodes'] * min(stats1['n_leafs'], stats1['depth']) * - stats2['n_nodes'] * min(stats2['n_leafs'], stats2['depth'])) ** .25 - - datakw['complexity'] = complexity - print('datakw = {}'.format(ub.repr2(datakw, nl=0, precision=2))) - - if True: - # idx + 4 > len(data_modes): - print('stats1 = {}'.format(ub.repr2(stats1, nl=0))) - print('stats2 = {}'.format(ub.repr2(stats2, nl=0))) - for runkw in run_modes: - paramkw = {**datakw, **runkw} - run_key = ub.repr2( - paramkw, sep='', itemsep='', kvsep='', - explicit=1, nobr=1, nl=0, precision=1) - try: - for timer in ti.reset(run_key): - with timer: - maximum_common_path_embedding(paths1, paths2, **runkw) - except RecursionError as ex: - print('ex = {!r}'.format(ex)) - row = paramkw.copy() - row['time'] = float('nan') - else: - row = paramkw.copy() - row['time'] = ti.min() - results.append(row) - prog.end() - - print(ub.repr2(ub.sorted_vals(ti.measures['min']), nl=1, align=':', precision=6)) - - import pandas as pd - import kwarray - df = pd.DataFrame.from_dict(results) - - dataparam_to_time = {} - for mode, subdf in df.groupby(['complexity'] + list(data_basis.keys())): - stats = kwarray.stats_dict(subdf['time']) - stats.pop('min', None) - stats.pop('std', None) - stats.pop('shape', None) - dataparam_to_time[mode] = stats - dataparam_to_time = ub.sorted_vals(dataparam_to_time, key=lambda x: x['max']) - print('dataparam_to_time = {}'.format(ub.repr2(dataparam_to_time, nl=1, precision=3, align=':'))) - print(list(data_basis.keys())) - - runparam_to_time = {} - for mode, subdf in df.groupby(['mode', 'impl']): - stats = kwarray.stats_dict(subdf['time']) - stats.pop('min', None) - stats.pop('std', None) - stats.pop('shape', None) - runparam_to_time[mode] = stats - runparam_to_time = ub.sorted_vals(runparam_to_time, key=lambda x: x['max']) - print('runparam_to_time = {}'.format(ub.repr2(runparam_to_time, nl=1, precision=3, align=':'))) - - -def benchmark_balanced_sequence_single(): - from netharn.initializers._nx_ext import balanced_sequence - from netharn.initializers._nx_ext import demodata - import ubelt as ub - mode = 'number' - seq1, open_to_close = demodata.random_balanced_sequence(200, mode=mode) - seq2, open_to_close = demodata.random_balanced_sequence(400, mode=mode, open_to_close=open_to_close) - longest_common_balanced_sequence = balanced_sequence.longest_common_balanced_sequence - impls = balanced_sequence.available_impls_longest_common_balanced_sequence() - results = {} - for impl in impls: - with ub.Timer(impl): - best, val = longest_common_balanced_sequence( - seq1, seq2, open_to_close, node_affinity=None, impl=impl) - results[impl] = val - assert allsame(results.values()) - - -def allsame(iterable, eq=operator.eq): - """ - Determine if all items in a sequence are the same - - Args: - iterable (Iterable[A]): - items to determine if they are all the same - - eq (Callable[[A, A], bool], default=operator.eq): - function used to test for equality - - Returns: - bool: True if all items are equal, otherwise False - - Example: - >>> allsame([1, 1, 1, 1]) - True - >>> allsame([]) 
- True - >>> allsame([0, 1]) - False - >>> iterable = iter([0, 1, 1, 1]) - >>> next(iterable) - >>> allsame(iterable) - True - >>> allsame(range(10)) - False - >>> allsame(range(10), lambda a, b: True) - True - """ - iter_ = iter(iterable) - try: - first = next(iter_) - except StopIteration: - return True - return all(eq(first, item) for item in iter_) diff --git a/netharn/initializers/_nx_ext/demodata.py b/netharn/initializers/_nx_ext/demodata.py deleted file mode 100644 index edcd827de0d9af6010adedb34087b6e5548dfe1d..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/demodata.py +++ /dev/null @@ -1,239 +0,0 @@ -""" -Helpers for creating random data for tests / benchmarks for the tree embedding -algorithms. -""" - - -def random_paths( - size=10, max_depth=10, common=0, prefix_depth1=0, prefix_depth2=0, - sep='/', labels=26, seed=None): - """ - Returns two randomly created paths (as in directory structures) for use in - testing and benchmarking :func:`maximum_common_path_embedding`. - - Parameters - ---------- - size : int - The number of independant random paths - - max_depth : int - Maximum depth for the independant random paths - - common : int - The number of shared common paths - - prefix_depth1: int - Depth of the random prefix attacheded to first common paths - - prefix_depth2: int - Depth of the random prefix attacheded to second common paths - - labels: int or collection - Number of or collection of tokens that can be used as node labels - - sep: str - path separator - - seed: - Random state or seed - - Examples - -------- - >>> paths1, paths2 = random_paths( - >>> size=5, max_depth=3, common=6, - >>> prefix_depth1=3, prefix_depth2=3, labels=2 ** 64, - >>> seed=0) - >>> from netharn.initializers._nx_ext.path_embedding import paths_to_otree - >>> from netharn.initializers._nx_ext.tree_embedding import tree_to_seq - >>> tree = paths_to_otree(paths1) - >>> seq, open_to_close, node_to_open = tree_to_seq(tree, mode='chr') - >>> seq, open_to_close, node_to_open = tree_to_seq(tree, mode='number') - >>> seq, open_to_close, node_to_open = tree_to_seq(tree, mode='tuple') - >>> # xdoctest: +REQUIRES(module:ubelt) - >>> import ubelt as ub - >>> print('paths1 = {}'.format(ub.repr2(paths1, nl=1))) - >>> print('paths2 = {}'.format(ub.repr2(paths2, nl=1))) - """ - from networkx.utils import create_py_random_state - rng = create_py_random_state(seed) - - if isinstance(labels, int): - alphabet = list(map(chr, range(ord('a'), ord('z')))) - - def random_label(): - digit = rng.randint(0, labels) - label = _convert_digit_base(digit, alphabet) - return label - else: - from functools import partial - random_label = partial(rng.choice, labels) - - def random_path(rng, max_depth): - depth = rng.randint(1, max_depth) - parts = [str(random_label()) for _ in range(depth)] - path = sep.join(parts) - return path - - # These paths might be shared (but usually not) - iid_paths1 = {random_path(rng, max_depth) for _ in range(size)} - iid_paths2 = {random_path(rng, max_depth) for _ in range(size)} - - # These paths will be shared - common_paths = {random_path(rng, max_depth) for _ in range(common)} - - if prefix_depth1 > 0: - prefix1 = random_path(rng, prefix_depth1) - common1 = {sep.join([prefix1, suff]) for suff in common_paths} - else: - common1 = common_paths - - if prefix_depth2 > 0: - prefix2 = random_path(rng, prefix_depth2) - common2 = {sep.join([prefix2, suff]) for suff in common_paths} - else: - common2 = common_paths - - paths1 = sorted(common1 | iid_paths1) - paths2 = 
sorted(common2 | iid_paths2) - - return paths1, paths2 - - -def random_ordered_tree(n, seed=None): - """ - Creates a random ordered tree - - TODO - ---- - - [ ] Rename to random_ordered_directed_tree ? - - [ ] Merge in with other data generators? - - Parameters - ---------- - n : int - A positive integer representing the number of nodes in the tree. - - seed : integer, random_state, or None (default) - Indicator of random number generation state. - See :ref:`Randomness`. - - Returns - ------- - networkx.OrderedDiGraph - - Example - ------- - >>> assert len(random_ordered_tree(n=1, seed=0).nodes) == 1 - >>> assert len(random_ordered_tree(n=2, seed=0).nodes) == 2 - >>> assert len(random_ordered_tree(n=3, seed=0).nodes) == 3 - >>> from netharn.initializers._nx_ext.tree_embedding import forest_str - >>> print(forest_str(random_ordered_tree(n=5, seed=3))) - └── 1 - ├── 4 - │   ├── 3 - │   └── 2 - └── 0 - """ - import networkx as nx - from networkx.utils import create_py_random_state - rng = create_py_random_state(seed) - # Create a random undirected tree - utree = nx.random_tree(n, seed=rng) - # Use a random root node and dfs to define edge directions - nodes = list(utree.nodes) - source = rng.choice(nodes) - edges = nx.dfs_edges(utree, source=source) - # Populate the ordered graph - otree = nx.OrderedDiGraph() - otree.add_nodes_from(utree.nodes) - otree.add_edges_from(edges) - return otree - - -def random_balanced_sequence(n, seed=None, mode='chr', open_to_close=None): - r""" - Creates a random balanced sequence for testing / benchmarks - - Parameters - ---------- - n : int - A positive integer representing the number of nodes in the tree. - - seed : integer, random_state, or None (default) - Indicator of random number generation state. - See :ref:`Randomness`. - - open_to_close : dict | None - if specified, updates existing open_to_close with tokens from this - sequence. - - mode: str - the type of sequence returned (see :func:`tree_to_seq` for details) - - Returns - ------- - : tuple - The first item is the sequence itself - the second item is the open_to_close mappings. 
- - Example - ------- - >>> # Demo the various sequence encodings that we might use - >>> seq, open_to_close = random_balanced_sequence(2, seed=1, mode='tuple') - >>> print('seq = {!r}'.format(seq)) - >>> seq, open_to_close = random_balanced_sequence(4, seed=1, mode='chr') - >>> print('seq = {!r}'.format(seq)) - >>> seq, open_to_close = random_balanced_sequence(4, seed=1, mode='number') - >>> print('seq = {!r}'.format(seq)) - >>> seq, open_to_close = random_balanced_sequence(4, seed=1, mode='str') - >>> print('seq = {!r}'.format(seq)) - >>> seq, open_to_close = random_balanced_sequence(10, seed=1, mode='paren') - >>> print('seq = {!r}'.format(seq)) - seq = (('open', 0), ('open', 1), ('close', 1), ('close', 0)) - seq = '\x00\x02\x04\x06\x07\x05\x03\x01' - seq = (1, 2, 3, 4, -4, -3, -2, -1) - seq = ('2(', '1(', '0(', '3(', ')3', ')0', ')1', ')2') - seq = '([[[]{{}}](){{[]}}])' - """ - from networkx.utils import create_py_random_state - from netharn.initializers._nx_ext.tree_embedding import tree_to_seq - # Create a random otree and then convert it to a balanced sequence - rng = create_py_random_state(seed) - tree = random_ordered_tree(n, seed=rng) - if mode == 'paren': - pool = '[{(' - for node in tree.nodes: - tree.nodes[node]['label'] = rng.choice(pool) - seq, open_to_close, _ = tree_to_seq( - tree, mode=mode, open_to_close=open_to_close, strhack=1) - else: - seq, open_to_close, _ = tree_to_seq( - tree, mode=mode, open_to_close=open_to_close) - return seq, open_to_close - - -def _convert_digit_base(digit, alphabet): - """ - Parameters - ---------- - digit : int - number in base 10 to convert - - alphabet : list - symbols of the conversion base - """ - baselen = len(alphabet) - x = digit - if x == 0: - return alphabet[0] - sign = 1 if x > 0 else -1 - x *= sign - digits = [] - while x: - digits.append(alphabet[x % baselen]) - x //= baselen - if sign < 0: - digits.append('-') - digits.reverse() - newbase_str = ''.join(digits) - return newbase_str diff --git a/netharn/initializers/_nx_ext/path_embedding.py b/netharn/initializers/_nx_ext/path_embedding.py deleted file mode 100644 index 0a7a71a532b18a59053c6efbe664875d05b38489..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/path_embedding.py +++ /dev/null @@ -1,143 +0,0 @@ -import networkx as nx -from .tree_embedding import maximum_common_ordered_tree_embedding - - -def maximum_common_path_embedding(paths1, paths2, sep='/', impl='iter-alt2', mode='chr'): - """ - Finds the maximum path embedding common between two sets of paths - - Parameters - ---------- - paths1, paths2: List[str] - a list of paths - - sep: str - path separator character - - impl: str - backend runtime to use - - mode: str - backend representation to use - - Returns - ------- - :tuple - corresponding lists subpaths1 and subpaths2 which are subsets of - paths1 and path2 respectively - - Examples - -------- - >>> paths1 = [ - >>> '/usr/bin/python', - >>> '/usr/bin/python3.6.1', - >>> '/usr/lib/python3.6/dist-packages/networkx', - >>> '/usr/lib/python3.6/dist-packages/numpy', - >>> '/usr/include/python3.6/Python.h', - >>> ] - >>> paths2 = [ - >>> '/usr/local/bin/python', - >>> '/usr/bin/python3.6.2', - >>> '/usr/local/lib/python3.6/dist-packages/networkx', - >>> '/usr/local/lib/python3.6/dist-packages/scipy', - >>> '/usr/local/include/python3.6/Python.h', - >>> ] - >>> subpaths1, subpaths2 = maximum_common_path_embedding(paths1, paths2) - >>> import pprint - >>> print('subpaths1 = {}'.format(pprint.pformat(subpaths1))) - >>> print('subpaths2 = 
{}'.format(pprint.pformat(subpaths2))) - subpaths1 = ['/usr/bin/python', - '/usr/include/python3.6/Python.h', - '/usr/lib/python3.6/dist-packages/networkx'] - subpaths2 = ['/usr/local/bin/python', - '/usr/local/include/python3.6/Python.h', - '/usr/local/lib/python3.6/dist-packages/networkx'] - """ - # the longest common balanced sequence problem - def _affinity(node1, node2): - score = 0 - for t1, t2 in zip(node1[::-1], node2[::-1]): - if t1 == t2: - score += 1 - else: - break - return score - node_affinity = _affinity - - tree1 = paths_to_otree(paths1, sep=sep) - tree2 = paths_to_otree(paths2, sep=sep) - - subtree1, subtree2 = maximum_common_ordered_tree_embedding( - tree1, tree2, node_affinity=node_affinity, impl=impl, mode=mode) - - subpaths1 = [sep.join(node) for node in subtree1.nodes if subtree1.out_degree[node] == 0] - subpaths2 = [sep.join(node) for node in subtree2.nodes if subtree2.out_degree[node] == 0] - return subpaths1, subpaths2 - - -def paths_to_otree(paths, sep='/'): - """ - Generates an ordered tree from a list of path strings - - Parameters - ---------- - paths: List[str] - a list of paths - - sep : str - path separation character. defaults to "/" - - Returns - ------- - nx.OrderedDiGraph - - Example - ------- - >>> from netharn.initializers._nx_ext.tree_embedding import forest_str - >>> paths = [ - >>> '/etc/ld.so.conf', - >>> '/usr/bin/python3.6', - >>> '/usr/include/python3.6/Python.h', - >>> '/usr/lib/python3.6/config-3.6m-x86_64-linux-gnu/libpython3.6.so', - >>> '/usr/local/bin/gnumake.h', - >>> '/usr/local/etc', - >>> '/usr/local/lib/python3.6/dist-packages/', - >>> ] - >>> otree = paths_to_otree(paths) - >>> print(forest_str(otree)) - └── / - ├── usr - │   ├── local - │   │   ├── lib - │   │   │   └── python3.6 - │   │   │   └── dist-packages - │   │   │   └── - │   │   ├── etc - │   │   └── bin - │   │   └── gnumake.h - │   ├── lib - │   │   └── python3.6 - │   │   └── config-3.6m-x86_64-linux-gnu - │   │   └── libpython3.6.so - │   ├── include - │   │   └── python3.6 - │   │   └── Python.h - │   └── bin - │   └── python3.6 - └── etc - └── ld.so.conf - """ - otree = nx.OrderedDiGraph() - for path in sorted(paths): - parts = tuple(path.split(sep)) - node_path = [] - for i in range(1, len(parts) + 1): - node = parts[0:i] - otree.add_node(node) - otree.nodes[node]['label'] = node[-1] - node_path.append(node) - for u, v in zip(node_path[:-1], node_path[1:]): - otree.add_edge(u, v) - if ('',) in otree.nodes: - otree.nodes[('',)]['label'] = sep - return otree diff --git a/netharn/initializers/_nx_ext/tests/test_balanced_sequence.py b/netharn/initializers/_nx_ext/tests/test_balanced_sequence.py deleted file mode 100644 index ed88346ee23d2db4aec9628db25884244e57b64d..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/tests/test_balanced_sequence.py +++ /dev/null @@ -1,32 +0,0 @@ - - -def test_all_implementations_are_same(): - """ - Tests several random sequences - """ - from netharn.initializers._nx_ext import balanced_sequence - from netharn.initializers._nx_ext import demodata - from networkx.utils import create_py_random_state - - seed = 93024896892223032652928827097264 - rng = create_py_random_state(seed) - - maxsize = 20 - num_trials = 5 - - for _ in range(num_trials): - n1 = rng.randint(1, maxsize) - n2 = rng.randint(1, maxsize) - - seq1, open_to_close = demodata.random_balanced_sequence(n1, seed=rng) - seq2, open_to_close = demodata.random_balanced_sequence(n2, open_to_close=open_to_close, seed=rng) - longest_common_balanced_sequence = 
balanced_sequence.longest_common_balanced_sequence - - # Note: the returned sequences may be different (maximum embeddings may not - # be unique), but the values should all be the same. - results = {} - impls = balanced_sequence.available_impls_longest_common_balanced_sequence() - for impl in impls: - best, val = longest_common_balanced_sequence( - seq1, seq2, open_to_close, node_affinity=None, impl=impl) - results[impl] = val diff --git a/netharn/initializers/_nx_ext/tests/test_path_embedding.py b/netharn/initializers/_nx_ext/tests/test_path_embedding.py deleted file mode 100644 index 82f2cd9b4f28c83f8bd1c19d1ae6897d06cc20c5..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/tests/test_path_embedding.py +++ /dev/null @@ -1,260 +0,0 @@ -from netharn.initializers._nx_ext.path_embedding import maximum_common_path_embedding -from netharn.initializers._nx_ext.demodata import random_paths - - -def test_not_compatable(): - paths1 = [ - 'foo/bar' - ] - paths2 = [ - 'baz/biz' - ] - embedding1, embedding2 = maximum_common_path_embedding(paths1, paths2) - assert len(embedding1) == 0 - assert len(embedding2) == 0 - - -def test_compatable(): - paths1 = [ - 'root/suffix1' - ] - paths2 = [ - 'root/suffix2' - ] - embedding1, embedding2 = maximum_common_path_embedding(paths1, paths2) - assert embedding1 == ['root'] - assert embedding2 == ['root'] - - paths1 = [ - 'root/suffix1' - ] - paths2 = [ - 'root' - ] - embedding1, embedding2 = maximum_common_path_embedding(paths1, paths2) - assert embedding1 == ['root'] - assert embedding2 == ['root'] - - -def test_prefixed(): - paths1 = [ - 'prefix1/root/suffix1' - ] - paths2 = [ - 'root/suffix2' - ] - embedding1, embedding2 = maximum_common_path_embedding(paths1, paths2) - assert embedding1 == ['prefix1/root'] - assert embedding2 == ['root'] - - paths1 = [ - 'prefix1/root/suffix1' - ] - paths2 = [ - 'prefix1/root/suffix2' - ] - embedding1, embedding2 = maximum_common_path_embedding(paths1, paths2) - assert embedding1 == ['prefix1/root'] - assert embedding2 == ['prefix1/root'] - - -def test_simple1(): - paths1 = [ - 'root/file1', - 'root/file2', - 'root/file3', - ] - paths2 = [ - 'prefix1/root/file1', - 'prefix1/root/file2', - 'root/file3', - ] - embedding1, embedding2 = maximum_common_path_embedding(paths1, paths2) - assert embedding1 == paths1 - assert embedding2 == paths2 - - paths1 = [ - 'root/file1', - 'root/file2', - 'root/file3', - ] - paths2 = [ - 'prefix1/root/file1', - 'prefix1/root/file2', - 'prefix2/root/file3', - 'prefix2/root/file4', - ] - embedding1, embedding2 = maximum_common_path_embedding(paths1, paths2) - assert embedding1 == paths1 - - -def test_random1(): - paths1, paths2 = random_paths(10, seed=321) - embedding1, embedding2 = maximum_common_path_embedding(paths1, paths2) - - -def _demodata_resnet_module_state(arch): - """ - Construct paths corresponding to resnet convnet state keys to - simulate a real world use case for path-embeddings. 
- - Ignore - ------ - # Check to make sure the demodata agrees with real data - import torchvision - paths_true = list(torchvision.models.resnet50().state_dict().keys()) - paths_demo = _demodata_resnet_module_state('resnet50') - print(ub.hzcat([ub.repr2(paths_true, nl=2), ub.repr2(paths_demo)])) - assert paths_demo == paths_true - - paths_true = list(torchvision.models.resnet18().state_dict().keys()) - paths_demo = _demodata_resnet_module_state('resnet18') - print(ub.hzcat([ub.repr2(paths_true, nl=2), ub.repr2(paths_demo)])) - assert paths_demo == paths_true - - paths_true = list(torchvision.models.resnet152().state_dict().keys()) - paths_demo = _demodata_resnet_module_state('resnet152') - print(ub.hzcat([ub.repr2(paths_true, nl=2), ub.repr2(paths_demo)])) - assert paths_demo == paths_true - """ - if arch == 'resnet18': - block_type = 'basic' - layer_blocks = [2, 2, 2, 2] - elif arch == 'resnet50': - block_type = 'bottleneck' - layer_blocks = [3, 4, 6, 3] - elif arch == 'resnet152': - block_type = 'bottleneck' - layer_blocks = [3, 8, 36, 3] - else: - raise KeyError(arch) - paths = [] - paths += [ - 'conv1.weight', - 'bn1.weight', - 'bn1.bias', - 'bn1.running_mean', - 'bn1.running_var', - 'bn1.num_batches_tracked', - ] - if block_type == 'bottleneck': - num_convs = 3 - elif block_type == 'basic': - num_convs = 2 - else: - raise KeyError(block_type) - - for layer_idx, nblocks in enumerate(layer_blocks, start=1): - for block_idx in range(0, nblocks): - prefix = 'layer{}.{}.'.format(layer_idx, block_idx) - - for conv_idx in range(1, num_convs + 1): - paths += [ - prefix + 'conv{}.weight'.format(conv_idx), - prefix + 'bn{}.weight'.format(conv_idx), - prefix + 'bn{}.bias'.format(conv_idx), - prefix + 'bn{}.running_mean'.format(conv_idx), - prefix + 'bn{}.running_var'.format(conv_idx), - prefix + 'bn{}.num_batches_tracked'.format(conv_idx), - ] - if block_idx == 0 and layer_idx > 0: - if block_type != 'basic' or layer_idx > 1: - paths += [ - prefix + 'downsample.0.weight', - prefix + 'downsample.1.weight', - prefix + 'downsample.1.bias', - prefix + 'downsample.1.running_mean', - prefix + 'downsample.1.running_var', - prefix + 'downsample.1.num_batches_tracked', - ] - paths += [ - 'fc.weight', - 'fc.bias', - ] - return paths - - -def test_realworld_case1(): - """ - import torchvision - paths1 = list(torchvision.models.resnet50().state_dict().keys()) - - print(ub.hzcat(['paths1 = {}'.format(ub.repr2(paths1, nl=2)), ub.repr2(paths)])) - len(paths1) - """ - # times: resnet18: 0.16 seconds - # times: resnet50: 0.93 seconds - # times: resnet152: 9.83 seconds - paths1 = _demodata_resnet_module_state('resnet50') - paths2 = ['module.' + p for p in paths1] - # import ubelt as ub - # with ub.Timer('test-real-world-case'): - embedding1, embedding2 = maximum_common_path_embedding( - paths1, paths2, sep='.') - assert [p[len('module.'):] for p in embedding2] == embedding1 - - -def test_realworld_case2(): - """ - import torchvision - paths1 = list(torchvision.models.resnet152().state_dict().keys()) - print('paths1 = {}'.format(ub.repr2(paths1, nl=2))) - """ - backbone = _demodata_resnet_module_state('resnet18') - - # Detector strips of prefix and suffix of the backbone net - subpaths = ['detector.' 
+ p for p in backbone[6:-2]] - paths1 = [ - 'detector.conv1.weight', - 'detector.bn1.weight', - 'detector.bn1.bias', - ] + subpaths + [ - 'detector.head1.conv1.weight', - 'detector.head1.conv2.weight', - 'detector.head1.conv3.weight', - 'detector.head1.fc.weight', - 'detector.head1.fc.bias', - 'detector.head2.conv1.weight', - 'detector.head2.conv2.weight', - 'detector.head2.conv3.weight', - 'detector.head2.fc.weight', - 'detector.head2.fc.bias', - ] - - paths2 = ['module.' + p for p in backbone] - - # import ubelt as ub - # with ub.Timer('test-real-world-case'): - embedding1, embedding2 = maximum_common_path_embedding( - paths1, paths2, sep='.') - - mapping = dict(zip(embedding1, embedding2)) - - # Note in the embedding case there may be superfluous assignments - # but they can either be discarded in post-processing or they wont - # be in the solution if we use isomorphisms instead of embeddings - assert len(subpaths) < len(mapping), ( - 'all subpaths should be in the mapping') - - non_common1 = set(paths1) - set(embedding1) - non_common2 = set(paths2) - set(embedding2) - - assert non_common2 == { - 'module.bn1.num_batches_tracked', - 'module.bn1.running_mean', - 'module.bn1.running_var', - } - - assert non_common1 == { - 'detector.conv1.weight', - 'detector.head1.conv1.weight', - 'detector.head1.conv2.weight', - 'detector.head1.conv3.weight', - 'detector.head1.fc.bias', - 'detector.head1.fc.weight', - 'detector.head2.conv2.weight', - 'detector.head2.conv3.weight', - } - # print('non_common1 = {}'.format(ub.repr2(non_common1, nl=1))) - # print('non_common2 = {}'.format(ub.repr2(non_common2, nl=1))) - # assert [p[len('module.'):] for p in embedding2] == embedding1 diff --git a/netharn/initializers/_nx_ext/tests/test_tree_embedding.py b/netharn/initializers/_nx_ext/tests/test_tree_embedding.py deleted file mode 100644 index b9a2048601a1068066c02c9f3a8b298964e0b917..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/tests/test_tree_embedding.py +++ /dev/null @@ -1,109 +0,0 @@ -from netharn.initializers._nx_ext.tree_embedding import ( - maximum_common_ordered_tree_embedding, forest_str) - -from netharn.initializers._nx_ext.demodata import ( - random_ordered_tree -) -import networkx as nx -import pytest -from networkx.utils import create_py_random_state - - -def test_null_common_embedding(): - """ - The empty graph is not a tree and should raise an error - """ - empty = nx.OrderedDiGraph() - non_empty = random_ordered_tree(n=1) - - with pytest.raises(nx.NetworkXPointlessConcept): - maximum_common_ordered_tree_embedding(empty, empty) - - with pytest.raises(nx.NetworkXPointlessConcept): - maximum_common_ordered_tree_embedding(empty, non_empty) - - with pytest.raises(nx.NetworkXPointlessConcept): - maximum_common_ordered_tree_embedding(non_empty, empty) - - -def test_self_common_embedding(): - """ - The common embedding of a tree with itself should always be itself - """ - rng = create_py_random_state(85652972257) - for n in range(1, 10): - tree = random_ordered_tree(n=n, seed=rng) - embedding1, embedding2 = maximum_common_ordered_tree_embedding(tree, tree) - assert tree.edges == embedding1.edges - - -def test_common_tree_embedding_small(): - tree1 = nx.OrderedDiGraph([(0, 1)]) - tree2 = nx.OrderedDiGraph([(0, 1), (1, 2)]) - print(forest_str(tree1)) - print(forest_str(tree2)) - - embedding1, embedding2 = maximum_common_ordered_tree_embedding(tree1, tree2) - print(forest_str(embedding1)) - print(forest_str(embedding2)) - - -def test_common_tree_embedding_small2(): - tree1 = 
nx.OrderedDiGraph([(0, 1), (2, 3), (4, 5), (5, 6)]) - tree2 = nx.OrderedDiGraph([(0, 1), (1, 2), (0, 3)]) - print(forest_str(tree1)) - print(forest_str(tree2)) - - embedding1, embedding2 = maximum_common_ordered_tree_embedding(tree1, tree2, node_affinity=None) - print(forest_str(embedding1)) - print(forest_str(embedding2)) - - -def test_all_implementations_are_same(): - """ - Tests several random sequences - """ - from netharn.initializers._nx_ext import balanced_sequence - from netharn.initializers._nx_ext import demodata - from networkx.utils import create_py_random_state - - seed = 24658885408229410362279507020239 - rng = create_py_random_state(seed) - - maxsize = 20 - num_trials = 5 - - for _ in range(num_trials): - n1 = rng.randint(1, maxsize) - n2 = rng.randint(1, maxsize) - - tree1 = demodata.random_ordered_tree(n1, seed=rng) - tree2 = demodata.random_ordered_tree(n2, seed=rng) - - # Note: the returned sequences may be different (maximum embeddings may not - # be unique), but the values should all be the same. - results = {} - impls = balanced_sequence.available_impls_longest_common_balanced_sequence() - for impl in impls: - # FIXME: do we need to rework the return value here? - subtree1, subtree2 = maximum_common_ordered_tree_embedding( - tree1, tree2, node_affinity=None, impl=impl) - _check_common_embedding_invariants(tree1, tree2, subtree1, subtree2) - results[impl] = len(subtree1.nodes) - - x = max(results.values()) - assert all(v == x for v in results.values()) - - -def _check_embedding_invariants(tree, subtree): - assert set(subtree.nodes).issubset(set(tree.nodes)), 'must have a node subset' - assert len(subtree.edges) <= len(tree.edges) - - -def _check_common_embedding_invariants(tree1, tree2, subtree1, subtree2): - """ - Validates that this solution satisfies properties of an embedding - """ - _check_embedding_invariants(tree1, subtree1) - _check_embedding_invariants(tree2, subtree2) - assert len(subtree1.nodes) == len(subtree2.nodes) diff --git a/netharn/initializers/_nx_ext/tree_embedding.py b/netharn/initializers/_nx_ext/tree_embedding.py deleted file mode 100644 index 9978eb711ef316073cb17ac69f51cb055e897655..0000000000000000000000000000000000000000 --- a/netharn/initializers/_nx_ext/tree_embedding.py +++ /dev/null @@ -1,558 +0,0 @@ -""" -Algorithm for computing tree embeddings -""" -import networkx as nx -from collections import OrderedDict, defaultdict -from .balanced_sequence import longest_common_balanced_sequence, UnbalancedException - - -def maximum_common_ordered_tree_embedding( - tree1, tree2, node_affinity='auto', impl='auto', mode='chr'): - """ - Finds the maximum common subtree-embedding between two ordered trees. - - A tree S is an embedded subtree of T if it can be obtained from T by a - series of edge contractions. - - Note this produces a subtree embedding, which is not necessarilly a - subgraph isomorphism (although a subgraph isomorphism is also an - embedding.) - - The maximum common embedded subtree problem can be solved in in - `O(n1 * n2 * min(d1, l1) * min(d2, l2))` time on ordered trees with n1 and - n2 nodes, of depth d1 and d2 and with l1 and l2 leaves, respectively. - - Implements algorithm described in [1]_, which introduces the problem as - follows: - - "An important generalization of tree and subtree isomorphism, known as - minor containment, is the problem of determining whether a tree is - isomorphic to an embedded subtree of another tree, where an embedded - subtree of a tree is obtained by contracting some of the edges in the tree. 
- A further generalization of minor containment on trees, known as maximum - common embedded subtree, is the problem of finding or determining the size - of a largest common embedded subtree of two trees. The latter also - generalizes the maximum common subtree isomorphism problem, in which a - common subtree of largest size is contained as a subtree, not only - embedded, in the two trees." - - Parameters - ---------- - tree1, tree2 : nx.OrderedDiGraph - Trees to find the maximum embedding between - - node_affinity : None | str | callable - Function for to determine if two nodes can be matched. The return is - interpreted as a weight that is used to break ties. If None then any - node can match any other node and only the topology is important. - The default is "eq", which is the same as ``operator.eq``. - - impl : str - Determines the backend implementation - - mode : str - Determines the backend representation - - References - ---------- - .. [1] Lozano, Antoni, and Gabriel Valiente. - "On the maximum common embedded subtree problem for ordered trees." - String Algorithmics (2004): 155-170. - https://pdfs.semanticscholar.org/0b6e/061af02353f7d9b887f9a378be70be64d165.pdf - - Returns - ------- - Tuple[nx.OrderedDiGraph, nx.OrderedDiGraph] : - The maximum value common embedding for each tree with respect to the - chosen ``node_affinity`` function. The topology of both graphs will - always be the same, the only difference is that the node labels in the - first and second embeddings will correspond to ``tree1`` and `tree2`` - respectively. When ``node_affinity='eq'`` then embeddings should be - identical. - - Example - ------- - >>> from netharn.initializers._nx_ext.tree_embedding import * # NOQA - >>> from netharn.initializers._nx_ext.demodata import random_ordered_tree # NOQA - >>> tree1 = random_ordered_tree(7, seed=3257073545741117277206611) - >>> tree2 = random_ordered_tree(7, seed=123568587133124688238689717) - >>> print('tree1') - >>> forest_str(tree1, write=print) - >>> print('tree2') - >>> forest_str(tree2, write=print) - >>> embedding1, embedding2 = maximum_common_ordered_tree_embedding(tree1, tree2 ) - >>> print('embedding1') - >>> forest_str(embedding1, write=print) - >>> print('embedding2') - >>> forest_str(embedding2, write=print) - tree1 - └── 1 - ├── 6 - │   ├── 4 - │   └── 3 - └── 0 - └── 5 - └── 2 - tree2 - └── 4 - └── 1 - ├── 2 - │   ├── 6 - │   └── 0 - └── 3 - └── 5 - embedding1 - └── 1 - ├── 6 - └── 5 - embedding2 - └── 1 - ├── 6 - └── 5 - """ - if not (isinstance(tree1, nx.OrderedDiGraph) and nx.is_forest(tree1)): - raise nx.NetworkXNotImplemented('only implemented for directed ordered trees') - if not (isinstance(tree1, nx.OrderedDiGraph) and nx.is_forest(tree2)): - raise nx.NetworkXNotImplemented('only implemented for directed ordered trees') - - # Convert the trees to balanced sequences - sequence1, open_to_close, node_to_open = tree_to_seq( - tree1, open_to_close=None, node_to_open=None, mode=mode) - sequence2, open_to_close, node_to_open = tree_to_seq( - tree2, open_to_close, node_to_open, mode=mode) - seq1 = sequence1 - seq2 = sequence2 - - # FIXME: I think this may cause bugs in two cases, which may or may not be - # possible, but I need to look into it and provide a fix or justification - # as to why these cases wont be hit: - # (1) when the two trees share nodes that have different open tokens - # (2) when the mapping between nodes to opening tokens is not unique. 
- # I'm not sure if this second case can happen when we are converting - # from a tree to a sequence, there are certainly sequences where the - # same opening token might share multiple tree nodes. - open_to_node = invert_dict(node_to_open) - - # Solve the longest common balanced sequence problem - best, value = longest_common_balanced_sequence( - seq1, seq2, open_to_close, open_to_node=open_to_node, - node_affinity=node_affinity, impl=impl) - subseq1, subseq2 = best - - # Convert the subsequence back into a tree - embedding1 = seq_to_tree(subseq1, open_to_close, open_to_node) - embedding2 = seq_to_tree(subseq2, open_to_close, open_to_node) - return embedding1, embedding2 - - -def tree_to_seq(tree, open_to_close=None, node_to_open=None, mode='tuple', strhack=None): - r""" - Converts an ordered tree to a balanced sequence for use in algorithm - reductions. - - Parameters - ---------- - open_to_close : Dict | None - Dictionary of opening to closing tokens to be updated for problems - where multiple trees are converted to sequences. - - open_to_node : Dict | None - Dictionary of opening tokens to nodes to be updated for problems where - multiple trees are converted to sequences. - - mode : str - Currently hacky and needs refactor. - Can be 'tuple', 'number', or 'chr'. - Hackier variants are 'str' and 'paren'. - - strhack : bool - Currently hacky and needs refactor. If False, always return a tuple of - items, if True, tries to return a string of items. If None, tries to - choose a value depending on mode. - - Example - ------- - >>> from netharn.initializers._nx_ext.tree_embedding import * # NOQA - >>> tree = nx.path_graph(3, nx.OrderedDiGraph) - >>> print(forest_str(tree)) - >>> sequence, open_to_close, node_to_open = tree_to_seq(tree, mode='number') - >>> print('sequence = {!r}'.format(sequence)) - └── 0 - └── 1 - └── 2 - sequence = (1, 2, 3, -3, -2, -1) - - >>> tree = nx.balanced_tree(2, 2, nx.OrderedDiGraph) - >>> print(forest_str(tree)) - >>> sequence, open_to_close, node_to_open = tree_to_seq(tree, mode='number') - >>> print('sequence = {!r}'.format(sequence)) - └── 0 - ├── 2 - │   ├── 6 - │   └── 5 - └── 1 - ├── 4 - └── 3 - sequence = (1, 2, 3, -3, 4, -4, -2, 5, 6, -6, 7, -7, -5, -1) - - >>> from netharn.initializers._nx_ext.demodata import random_ordered_tree # NOQA - >>> tree = random_ordered_tree(2, seed=1) - >>> sequence, open_to_close, node_to_open = tree_to_seq(tree, mode='tuple') - >>> print('sequence = {!r}'.format(sequence)) - >>> sequence, open_to_close, node_to_open = tree_to_seq(tree, mode='chr') - >>> print('sequence = {!r}'.format(sequence)) - >>> sequence, open_to_close, node_to_open = tree_to_seq(tree, mode='number') - >>> print('sequence = {!r}'.format(sequence)) - sequence = (('open', 0), ('open', 1), ('close', 1), ('close', 0)) - sequence = '\x00\x02\x03\x01' - sequence = (1, 2, -2, -1) - """ - # mapping between opening and closing tokens - sources = [n for n in tree.nodes if tree.in_degree[n] == 0] - sequence = [] - - if strhack is None: - if mode == 'chr': - strhack = True - - if open_to_close is None: - open_to_close = {} - if node_to_open is None: - node_to_open = {} - - if strhack: - if mode == 'paren': - all_labels = {n['label'] for n in list(tree.nodes.values())} - assert all(x == 1 for x in map(len, all_labels)) - - for source in sources: - for u, v, etype in nx.dfs_labeled_edges(tree, source=source): - if etype == 'forward': - # u has been visited by v has not - if v not in node_to_open: - if mode == 'tuple': - open_tok = ('open', v) - close_tok = ('close', 
v) - elif mode == 'number': - open_tok = len(node_to_open) + 1 - close_tok = -open_tok - elif mode == 'str': - open_tok = '{}('.format(v) - close_tok = '){}'.format(v) - elif mode == 'chr': - if not strhack: - # note ussing the accent mark wont work in string - # mode even though the close tok renders as a - # single character. - open_tok = str(v) - close_tok = str(v) + u'\u0301' - else: - # utf8 can only encode this many chars - assert len(node_to_open) < (1112064 // 2) - open_tok = chr(len(node_to_open) * 2) - close_tok = chr(len(node_to_open) * 2 + 1) - elif mode == 'paren': - open_tok = tree.nodes[v]['label'] - assert strhack - if open_tok == '{': - close_tok = '}' - elif open_tok == '[': - close_tok = ']' - elif open_tok == '(': - close_tok = ')' - else: - raise KeyError(open_tok) - else: - raise KeyError(mode) - node_to_open[v] = open_tok - open_to_close[open_tok] = close_tok - open_tok = node_to_open[v] - sequence.append(open_tok) - elif etype == 'reverse': - # Both u and v are visited and the edge is in the tree - close_tok = open_to_close[node_to_open[v]] - sequence.append(close_tok) - else: - raise KeyError(etype) - sequence = tuple(sequence) - if strhack: - sequence = ''.join(sequence) - return sequence, open_to_close, node_to_open - - -def seq_to_tree(subseq, open_to_close, open_to_node): - """ - Converts a balanced sequence to an ordered tree - - Parameters - ---------- - subseq : Tuple | str - a balanced sequence of hashable items as a string or tuple - - open_to_close : Dict - a dictionary that maps opening tokens to closing tokens in the balanced - sequence problem. - - open_to_node : Dict - a dictionary that maps a sequence token to a node corresponding to an - original problem (e.g. a tree node). Must be unique. If unspecified new - nodes will be generated and the opening sequence token will be used as - a node label. - - Example - -------- - >>> from netharn.initializers._nx_ext.demodata import random_ordered_tree - >>> open_to_close = {'{': '}', '(': ')', '[': ']'} - >>> open_to_node = None - >>> subseq = '({[[]]})[[][]]{{}}' - >>> subtree = seq_to_tree(subseq, open_to_close, open_to_node) - >>> print(forest_str(subtree)) - ├── { - │   └── { - ├── [ - │   ├── [ - │   └── [ - └── ( - └── { - └── [ - └── [ - """ - nextnode = 0 # only used if open_to_node is not specified - subtree = nx.OrderedDiGraph() - stack = [] - for token in subseq: - if token in open_to_close: - if open_to_node is None: - node = nextnode - nextnode += 1 - else: - node = open_to_node[token] - if stack: - parent_tok, parent_node = stack[-1] - subtree.add_edge(parent_node, node) - else: - subtree.add_node(node) - if open_to_node is None: - subtree.nodes[node]['label'] = token - stack.append((token, node)) - else: - if not stack: - raise UnbalancedException - prev_open, prev_node = stack.pop() - want_close = open_to_close[prev_open] - if token != want_close: - raise UnbalancedException - return subtree - - -def invert_dict(dict_, unique_vals=True): - """ - Swaps the keys and values in a dictionary. - - Parameters - ---------- - dict_ (Dict[A, B]): dictionary to invert - - unique_vals (bool, default=True): if False, the values of the new - dictionary are sets of the original keys. - - Returns - ------- - Dict[B, A] | Dict[B, Set[A]]: - the inverted dictionary - - Notes - ----- - The must values be hashable. - - If the original dictionary contains duplicate values, then only one of - the corresponding keys will be returned and the others will be - discarded. 
This can be prevented by setting ``unique_vals=False``, - causing the inverted keys to be returned in a set. - - Example - ------- - >>> from netharn.initializers._nx_ext.tree_embedding import * # NOQA - >>> dict_ = {'a': 1, 'b': 2} - >>> inverted = invert_dict(dict_) - >>> assert inverted == {1: 'a', 2: 'b'} - - Example - ------- - >>> from netharn.initializers._nx_ext.tree_embedding import * # NOQA - >>> dict_ = OrderedDict([(2, 'a'), (1, 'b'), (0, 'c'), (None, 'd')]) - >>> inverted = invert_dict(dict_) - >>> assert list(inverted.keys())[0] == 'a' - - Example - ------- - >>> from netharn.initializers._nx_ext.tree_embedding import * # NOQA - >>> dict_ = {'a': 1, 'b': 0, 'c': 0, 'd': 0, 'f': 2} - >>> inverted = invert_dict(dict_, unique_vals=False) - >>> assert inverted == {0: {'b', 'c', 'd'}, 1: {'a'}, 2: {'f'}} - """ - if unique_vals: - if isinstance(dict_, OrderedDict): - inverted = OrderedDict((val, key) for key, val in dict_.items()) - else: - inverted = {val: key for key, val in dict_.items()} - else: - # Handle non-unique keys using groups - inverted = defaultdict(set) - for key, value in dict_.items(): - inverted[value].add(key) - inverted = dict(inverted) - return inverted - - -def forest_str(graph, use_labels=True, sources=None, write=None): - """ - Creates a nice utf8 representation of a directed forest - - Parameters - ---------- - graph : nx.DiGraph | nx.Graph - Graph to represent (must be a tree, forest, or the empty graph) - - use_labels : bool - If True will use the "label" attribute of a node to display if it - exists otherwise it will use the node value itself. Defaults to True. - - sources : List - Mainly relevant for undirected forests, specifies which nodes to list - first. If unspecified the root nodes of each tree will be used for - directed forests; for undirected forests this defaults to the nodes - with the smallest degree. - - write : callable - Function to use to write to, if None new lines are appended to - a list and returned. If set to the `print` function, lines will - be written to stdout as they are generated. If specified, - this function will return None. Defaults to None. 
- - Returns - ------- - str | None : - utf8 representation of the tree / forest - - Example - ------- - >>> import networkx as nx - >>> graph = nx.balanced_tree(r=2, h=3, create_using=nx.DiGraph) - >>> print(forest_str(graph)) - ╙── 0 - ├─╼ 2 - │   ├─╼ 6 - │   │   ├─╼ 14 - │   │   └─╼ 13 - │   └─╼ 5 - │   ├─╼ 12 - │   └─╼ 11 - └─╼ 1 - ├─╼ 4 - │   ├─╼ 10 - │   └─╼ 9 - └─╼ 3 - ├─╼ 8 - └─╼ 7 - - >>> graph = nx.balanced_tree(r=1, h=2, create_using=nx.Graph) - >>> print(nx.forest_str(graph)) - ╟── 1 - ╎   ├── 2 - ╎   └── 0 - """ - import networkx as nx - - printbuf = [] - if write is None: - _write = printbuf.append - else: - _write = write - - if len(graph.nodes) == 0: - _write("╙") - else: - if not nx.is_forest(graph): - raise nx.NetworkXNotImplemented("input must be a forest or the empty graph") - - is_directed = graph.is_directed() - succ = graph.succ if is_directed else graph.adj - - if sources is None: - if is_directed: - # use real source nodes for directed trees - sources = [n for n in graph.nodes if graph.in_degree[n] == 0] - else: - # use arbitrary sources for undirected trees - sources = sorted(graph.nodes, key=lambda n: graph.degree[n]) - - seen = set() - stack = [] - for idx, node in enumerate(sources): - islast_next = idx == 0 - stack.append((node, "", islast_next)) - - while stack: - node, indent, islast = stack.pop() - if node in seen: - continue - seen.add(node) - - # Notes on available box and arrow characters - # https://en.wikipedia.org/wiki/Box-drawing_character - # https://stackoverflow.com/questions/2701192/triangle-arrow - if not indent: - # Top level items (i.e. trees in the forest) get different - # glyphs to indicate they are not actually connected - if islast: - this_prefix = indent + "╙── " - next_prefix = indent + " " - else: - this_prefix = indent + "╟── " - next_prefix = indent + "╎   " - - else: - # For individual forests distinguish between directed and - # undirected cases - if is_directed: - if islast: - this_prefix = indent + "└─╼ " - next_prefix = indent + " " - else: - this_prefix = indent + "├─╼ " - next_prefix = indent + "│   " - else: - if islast: - this_prefix = indent + "└── " - next_prefix = indent + " " - else: - this_prefix = indent + "├── " - next_prefix = indent + "│   " - - if use_labels: - label = graph.nodes[node].get("label", node) - else: - label = node - - _write(this_prefix + str(label)) - - children = [child for child in succ[node] if child not in seen] - for idx, child in enumerate(children, start=1): - islast_next = idx <= 1 - try_frame = (child, next_prefix, islast_next) - stack.append(try_frame) - - if write is None: - # Only return a string if the custom write function was not specified - return "\n".join(printbuf) - - -if __name__ == '__main__': - """ - CommandLine: - python -m netharn.initializers._nx_ext.tree_embedding all - python -m netharn.initializers._nx_ext all - """ - import xdoctest - xdoctest.doctest_module(__file__) diff --git a/netharn/initializers/functional.py b/netharn/initializers/functional.py index 7be96f6e5f96598cdcc90cf8700ad0b5e613e2ce..95f45688e9e6ec8ece41caae45fb0fe4b10dce56 100644 --- a/netharn/initializers/functional.py +++ b/netharn/initializers/functional.py @@ -335,9 +335,7 @@ def load_partial_state(model, model_state_dict, leftover=None, print('CONTRACT') # pair_freq = ub.dict_hist(ub.flatten([tups1, tups2])) - from netharn.initializers._nx_ext.tree_embedding import forest_str - from netharn.initializers._nx_ext.path_embedding import paths_to_otree - print(forest_str(paths_to_otree(other_keys, '.'))) + 
# print(forest_str(paths_to_otree(other_keys, '.'))) # common_keys = other_keys.intersection(self_keys) # if not common_keys: @@ -810,29 +808,19 @@ def maximum_common_ordered_subpaths(paths1, paths2, sep='.', mode='embedding'): tree1 = paths_to_otree(paths1) tree2 = paths_to_otree(paths2) - # from netharn.initializers._nx_ext.tree_embedding import forest_str + # from netharn.initializers._nx_ext_v2.tree_embedding import forest_str # print(len(tree1.nodes)) # print(len(tree2.nodes)) # print(forest_str(tree1)) # print(forest_str(tree2)) - # if 0: - # DiGM = isomorphism.DiGraphMatcher(tree1, tree2) - # DiGM.is_isomorphic() - # list(DiGM.subgraph_isomorphisms_iter()) - - if 0: - from netharn.initializers import _nx_ext - assert mode == 'embedding' - subtree1, subtree2 = _nx_ext.maximum_common_ordered_tree_embedding(tree1, tree2, node_affinity=node_affinity) + from netharn.initializers import _nx_ext_v2 + if mode == 'embedding': + subtree1, subtree2, value = _nx_ext_v2.maximum_common_ordered_subtree_embedding(tree1, tree2, node_affinity=node_affinity) + elif mode == 'isomorphism': + subtree1, subtree2, value = _nx_ext_v2.maximum_common_ordered_subtree_isomorphism(tree1, tree2, node_affinity=node_affinity) else: - from netharn.initializers import _nx_ext_v2 - if mode == 'embedding': - subtree1, subtree2, value = _nx_ext_v2.maximum_common_ordered_subtree_embedding(tree1, tree2, node_affinity=node_affinity) - elif mode == 'isomorphism': - subtree1, subtree2, value = _nx_ext_v2.maximum_common_ordered_subtree_isomorphism(tree1, tree2, node_affinity=node_affinity) - else: - raise KeyError(mode) + raise KeyError(mode) subpaths1 = [sep.join(node) for node in subtree1.nodes if subtree1.out_degree[node] == 0] subpaths2 = [sep.join(node) for node in subtree2.nodes if subtree2.out_degree[node] == 0] diff --git a/setup.py b/setup.py index 6be4f3cd694fa2d973bb7f4ca13d32f7e85297f1..a74cb1c23b247218237d67b8bace01896c30912c 100755 --- a/setup.py +++ b/setup.py @@ -250,6 +250,9 @@ if __name__ == '__main__': 'tests': parse_requirements('requirements/tests.txt'), }, packages=find_packages(include='netharn.*'), + package_data={ + 'netharn.initializers._nx_ext_v2': ['*.pyx'], + }, license='Apache 2', classifiers=[ # List of classifiers available at: diff --git a/super_setup.py b/super_setup.py index 045fb918f7479fdcd0245b94fd61a2ebb33c50fd..9f415c5a0138f96532c74cea03410a94faeb88b9 100755 --- a/super_setup.py +++ b/super_setup.py @@ -768,7 +768,7 @@ def determine_code_dpath(): DEVEL_REPOS = [ # The util libs { - 'name': 'kwarray', 'branch': 'dev/0.5.12', 'remote': 'public', + 'name': 'kwarray', 'branch': 'dev/0.5.13', 'remote': 'public', 'remotes': {'public': 'git@gitlab.kitware.com:computer-vision/kwarray.git'}, }, { @@ -810,7 +810,7 @@ DEVEL_REPOS = [ # netharn - training harness { - 'name': 'netharn', 'branch': 'dev/0.5.11', 'remote': 'public', + 'name': 'netharn', 'branch': 'dev/0.5.12', 'remote': 'public', 'remotes': {'public': 'git@gitlab.kitware.com:computer-vision/netharn.git'}, }, ]
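The `functional.py` hunk above swaps the removed `_nx_ext` embedding call, which returned only the two embedded subtrees, for the `_nx_ext_v2` functions, which additionally return the match value. A minimal usage sketch of the new call pattern follows; the tiny example trees and `node_affinity=None` follow the conventions of the deleted `_nx_ext` tests and are assumptions here, since the `_nx_ext_v2` defaults are not shown in this patch.

import networkx as nx
from netharn.initializers import _nx_ext_v2

# Two small ordered trees (OrderedDiGraph edge lists define the order).
tree1 = nx.OrderedDiGraph([(0, 1), (1, 2)])
tree2 = nx.OrderedDiGraph([(0, 1), (0, 2)])

# Per the functional.py hunk, the v2 API returns a 3-tuple:
# both embeddings plus the returned value.
subtree1, subtree2, value = _nx_ext_v2.maximum_common_ordered_subtree_embedding(
    tree1, tree2, node_affinity=None)

# The isomorphism variant used for mode='isomorphism' has the same shape.
subtree1, subtree2, value = _nx_ext_v2.maximum_common_ordered_subtree_isomorphism(
    tree1, tree2, node_affinity=None)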