Source code for ggfm.data.hgsampling

import numpy as np
from collections import defaultdict


[docs]def feature_extractor(layer_data, graph): r"""`"GPT-GNN: Generative Pre-Training of Graph Neural Networks" <https://arxiv.org/abs/2006.15437>`_ paper. Extract relevent features. Parameters ---------- layer_data: dict Sampled node indexes for each node type. graph: class:`ggfm.data.Graph` Target graph. """ feature = {} times = {} indxs = {} texts = [] for _type in layer_data: if len(layer_data[_type]) == 0: continue idxs = np.array(list(layer_data[_type].keys())) # origin_indxs tims = np.array(list(layer_data[_type].values()))[:,1] # times if 'node_emb' in graph.node_feature[_type]: feature[_type] = np.array(list(graph.node_feature[_type].loc[idxs, 'node_emb']), dtype=float) else: # intialize as 0 feature[_type] = np.zeros([len(idxs), 400]) # 400, 768, 1 feature[_type] = np.concatenate((feature[_type], list(graph.node_feature[_type].loc[idxs, 'emb']),\ np.log10(np.array(list(graph.node_feature[_type].loc[idxs, 'citation'])).reshape(-1, 1) + 0.01)), axis=1) times[_type] = tims indxs[_type] = idxs if _type == 'paper': attr = np.array(list(graph.node_feature[_type].loc[idxs, 'title']), dtype=str) return feature, times, indxs, attr
[docs]def sample_subgraph(graph, time_range, sampled_depth = 2, sampled_number = 8, inp=None): r"""`"GPT-GNN: Generative Pre-Training of Graph Neural Networks" <https://arxiv.org/abs/2006.15437>`_ paper. Sample Sub-Graph based on the connection of other nodes with currently sampled nodes Budgets are maintained for each node type, indexed by <node_id, time>. Currently sampled nodes are stored in layer_data. After nodes are sampled, the sampled adjacancy matrix are constructed. Parameters ---------- graph: class:`ggfm.data.Graph` Target graph. time_range: list Time range of target nodes. sampled_depth: int, optional Sampled depth. (default: :obj:`2`) sampled_number: int, optional Sampled number. (default: :obj:`8`) inp: dict Input data for sampling. `inp = {target_type: samp_target_nodes}` """ layer_data = defaultdict( #target_type lambda: {} # {target_id: [ser, time]} ) budget = defaultdict( #source_type lambda: defaultdict( # source_id lambda: [0., 0] # [sampled_score, time] )) ''' For each node being sampled, we find out all its neighborhood, adding the degree count of these nodes in the budget. Note that there exist some nodes that have many neighborhoods (such as fields, venues), for those case, we only consider ''' def add_budget(te, target_id, target_time, layer_data, budget): for source_type in te: # source_type tes = te[source_type] # relation for relation_type in tes: # such as: rev_PV_Conference, rev_PV_Journal if relation_type == 'self' or target_id not in tes[relation_type]: continue adl = tes[relation_type][target_id] # {source_id: year, } if len(adl) < sampled_number: sampled_ids = list(adl.keys()) else: sampled_ids = np.random.choice(list(adl.keys()), sampled_number, replace = False) for source_id in sampled_ids: source_time = adl[source_id] if source_time == None: source_time = target_time if source_time > np.max(list(time_range.keys())) or source_id in layer_data[source_type]: continue budget[source_type][source_id][0] += 1. / len(sampled_ids) # score budget[source_type][source_id][1] = source_time # time ''' First adding the sampled nodes then updating budget. ''' # inp = {target_type: samp_target_nodes} # inp['paper'].shape = (batch_size, 2) [[id, year], [id, year], ] for _type in inp: # paper for _id, _time in inp[_type]: # id transfer layer_data[_type][_id] = [len(layer_data[_type]), _time] # id -> cur_length // layer_data: {'paper': {id: [cur_length, year], }} for _type in inp: # sampling nodes for each source type of each target type te = graph.edge_list[_type] # such as: paper_venue, paper_paper, paper_field, paper_author for _id, _time in inp[_type]: add_budget(te, _id, _time, layer_data, budget) ''' We recursively expand the sampled graph by sampled_depth. Each time we sample a fixed number of nodes for each budget, based on the accumulated degree. ''' for layer in range(sampled_depth): sts = list(budget.keys()) for source_type in sts: te = graph.edge_list[source_type] keys = np.array(list(budget[source_type].keys())) if sampled_number > len(keys): sampled_ids = np.arange(len(keys)) else: score = np.array(list(budget[source_type].values()))[:,0] ** 2 score = score / np.sum(score) sampled_ids = np.random.choice(len(score), sampled_number, p = score, replace = False) sampled_keys = keys[sampled_ids] ''' First adding the sampled nodes then updating budget. ''' for k in sampled_keys: layer_data[source_type][k] = [len(layer_data[source_type]), budget[source_type][k][1]] # layer_data[source_type] {id: [cur_length, time], } for k in sampled_keys: add_budget(te, k, budget[source_type][k][1], layer_data, budget) budget[source_type].pop(k) # Prepare feature, time and adjacency matrix for the sampled graph, indxs are the origin indexes, texts are the title information of papers feature, times, indxs, texts = feature_extractor(layer_data, graph) edge_list = defaultdict( # target_type lambda: defaultdict( # source_type lambda: defaultdict( # relation_type lambda: [] # [target_id, source_id] ))) for _type in layer_data: # {type: {id: [cur_id, year], }, } for _key in layer_data[_type]: _ser = layer_data[_type][_key][0] # cur_id edge_list[_type][_type]['self'] += [[_ser, _ser]] # add self-loop ''' Reconstruct sampled adjacancy matrix by checking whether each link exist in the original graph ''' for target_type in graph.edge_list: te = graph.edge_list[target_type] tld = layer_data[target_type] # {type: {id: [cur_id, year], }, } for source_type in te: tes = te[source_type] sld = layer_data[source_type] for relation_type in tes: # relation tesr = tes[relation_type] # target_id for target_key in tld: # sampled target_ids if target_key not in tesr: continue target_ser = tld[target_key][0] # cur_id for source_key in tesr[target_key]: # Check whether each link (target_id, source_id) exist in original adjacancy matrix if source_key in sld: source_ser = sld[source_key][0] edge_list[target_type][source_type][relation_type] += [[target_ser, source_ser]] return feature, times, edge_list, indxs, texts