Source code for common.preprocessing

import sys
from gurobipy import tuplelist
import random
from collections import OrderedDict
import networkx as nx
import copy
import data as d


[docs]class Preprocess: def __init__(self): return # done def preprocess_with_weights(self, data, k): data_components = list() small_components = set() g = self.make_graph(data) components = nx.connected_components(g) for c in components: if len(c) < k: small_components |= {c} continue print "connected component" nodes_to_remove = set() nodes_degree_one = set() nodes_covering_less = set() nodes_noncovering = set() for i in c: if i in small_components: continue if i in data.node_mutations: for u in c: if u in data.node_mutations and i != u: if data.node_neighbors[i] <= data.node_neighbors[u]\ and data.node_mutations[i] <= data.node_mutations[u]: # 4: node u strictly "better" if data.weights[i] <= data.weights[u]: nodes_covering_less.add(i) break else: # 1: nodes of degree 1 not covering any patients if len(data.node_neighbors[i]) == 1: nodes_degree_one.add(i) else: for u in c: if data.node_neighbors[i] <= data.node_neighbors[u]: if u in data.node_mutations: # 3: more useful (covering) node that also connects nodes nodes_noncovering.add(i) break elif data.node_neighbors[i] != data.node_neighbors[u] or u < i: # 2: another, smaller index node that has same neighbors, and covers nothing nodes_noncovering.add(i) break print "before, in component : %d " % len(c) print "degree 1, no cover: %d " % len(nodes_degree_one) print "exists better node: %d " % len(nodes_covering_less) print "no cover, same nbs: %d " % len(nodes_noncovering) print "small components: %d " % len(small_components) nodes_to_remove |= small_components nodes_to_remove |= nodes_degree_one nodes_to_remove |= nodes_covering_less nodes_to_remove |= nodes_noncovering print "all nodes to remove: %d " % len(nodes_to_remove) if len(c) - len(nodes_to_remove) >= k: # # nodes_to_keep = [x for x in c if x not in nodes_to_remove] # print "nodes after prep: %d " % len(nodes_to_keep) data_components.append(data.data_from_nodelist(c, nodes_to_remove, weights=data.weights)) return data_components # # # PREPROCESS # done def make_graph(self, data): graph = nx.Graph() graph.add_nodes_from(data.nodes) for [u, w] in data.interactions: graph.add_edge(u, w) return graph # done def preprocess(self, data, k): if data.weights: return data.preprocess_with_weights(k) data_components = list() small_components = set() g = self.make_graph(data) components = nx.connected_components(g) for c in components: if len(c) < k: small_components |= set(c) continue nodes_to_remove = set() nodes_degree_one = set() nodes_covering_less = set() nodes_noncovering = set() # TEST # data_components.append(self.data_from_nodelist(data, c, {'8152'})) # return data_components # TEST for i in c: if i in data.node_mutations: for u in c: if u in data.node_mutations and i != u: if data.node_neighbors[i] <= data.node_neighbors[u]\ and data.node_mutations[i] <= data.node_mutations[u]: # 4: node u strictly "better" if data.node_neighbors[i] != data.node_neighbors[u] \ or data.node_mutations[i] != data.node_mutations[u]: nodes_covering_less.add(i) break # 5: u at least as good and with smaller index elif u < i: nodes_covering_less.add(i) break else: # 1: nodes of degree 1 not covering any patients if len(data.node_neighbors[i]) == 1: nodes_degree_one.add(i) else: for u in c: if data.node_neighbors[i] <= data.node_neighbors[u]: if u in data.node_mutations: # 3: more useful (covering) node that also connects nodes nodes_noncovering.add(i) break elif data.node_neighbors[i] != data.node_neighbors[u] or u < i: # 2: another, smaller index node that has same neighbors, and covers nothing nodes_noncovering.add(i) break print "before, in component : %d " % len(c) print "degree 1, no cover: %d " % len(nodes_degree_one) print "exists better node: %d " % len(nodes_covering_less) print "no cover, same nbs: %d " % len(nodes_noncovering) print "small components: %d " % len(small_components) nodes_to_remove |= small_components nodes_to_remove |= nodes_degree_one nodes_to_remove |= nodes_covering_less nodes_to_remove |= nodes_noncovering print "all nodes to remove: %d " % len(nodes_to_remove) if len(c) - len(nodes_to_remove) >= k: # nodes_to_keep = [x for x in c if x not in nodes_to_remove] # print "nodes after prep: %d " % len(nodes_to_keep) data_components.append(self.data_from_nodelist(data, c, nodes_to_remove)) return data_components # done def data_from_nodelist(self, data, all_nodes, nodes_to_remove=set(), weights=None): if not nodes_to_remove: return self new_data = d.Data() print new_data # sys.exit(0) component_patient_set = set() for i in all_nodes: if i in nodes_to_remove: continue new_data.nodes.append(i) # update interactions if i in data.node_neighbors: for u in data.node_neighbors[i]: if u not in nodes_to_remove: # save interaction only once if i < u: new_data.interactions.append((i, u)) # update auxiliary adjacency list data structure if i in new_data.node_neighbors: new_data.node_neighbors[i].add(u) else: new_data.node_neighbors[i] = {u} if weights: new_data.weights[i] = weights[i] # update mutations if i in data.node_mutations: new_data.node_mutations[i] = data.node_mutations[i] for p in new_data.node_mutations[i]: new_data.mutations.append((p, i)) component_patient_set.add(p) # print new_data.nodes # for i in new_data.node_mutations.keys(): # print i,i in new_data.nodes new_data.patients = list(component_patient_set) return new_data
# # TODO ? # def preprocess_genes(self, k): # preprocessed = dict() # # connected_components = list() # data_components = list() # small_components = set() # g = self.make_graph(data) # components = nx.connected_components(g) # for c in components: # if len(c) < k: # small_components |= set(c) # else: # connected_components.append(c) # for c in connected_components: # nodes_to_remove = set() # nodes_degree_one = set() # nodes_covering_less = set() # nodes_noncovering = set() # # for i in c: # if i in small_components: # continue # if i in data.node_mut: # for u in c: # if u in data.node_mut and i != u: # if data.node_neighbors[i] <= data.node_neighbors[u]\ # and data.node_mut[i] <= data.node_mut[u]: # # 4: node u strictly "better" # if data.node_neighbors[i] != data.node_neighbors[u] \ # or data.node_mut[i] != data.node_mut[u]: # nodes_covering_less.add(i) # # if u in preprocessed: # # preprocessed[u].add(i) # # else: # # preprocessed[i] = {i} # # break # # 5: u at least as good and with smaller index # elif u < i: # nodes_covering_less.add(i) # if u in preprocessed: # preprocessed[u].add(i) # else: # preprocessed[u] = {i} # break # else: # # 1: nodes of degree 1 not covering any patients # if len(data.node_neighbors[i]) == 1: # nodes_degree_one.add(i) # else: # for u in c: # if data.node_neighbors[i] <= data.node_neighbors[u]: # if u in data.node_mut: # # 3: more useful (covering) node that also connects nodes # nodes_noncovering.add(i) # break # elif data.node_neighbors[i] != data.node_neighbors[u] or u < i: # # 2: another, smaller index node that has same neighbors, and covers nothing # nodes_noncovering.add(i) # break # # print "before, in component : %d " % len(c) # print "degree 1, no cover: %d " % len(nodes_degree_one) # print "exists better node: %d " % len(nodes_covering_less) # print "no cover, same nbs: %d " % len(nodes_noncovering) # print "small components: %d " % len(small_components) # return preprocessed