Source code for ggfm.data.random_walk

import time
import random
from sklearn.model_selection import train_test_split

from warnings import filterwarnings
filterwarnings("ignore")





def get_type_id(graph):
    r"""
    Compute the begin index of every node type of a graph.

    Parameters
    ----------
    graph: class:`ggfm.data.Graph`
        Target graph. It must provide ``get_types()`` and a ``node_feature``
        mapping whose per-type entries expose a ``"name"`` column
        (``"title"`` for the ``"paper"`` type) supporting ``.tolist()``.

    Returns
    -------
    type_begin_ids: list
        The begin index of each node type. ``type_begin_ids[i]`` is the sum of
        the node counts of all types that precede ``graph_node_type[i]``, so
        the list is aligned with ``graph_node_type``. It always starts with
        ``0`` (and is ``[0]`` when the graph has no node type).
    graph_node_type: list
        Graph node types, as returned by ``graph.get_types()``.
    """
    graph_node_type = graph.get_types()

    # Number of nodes per type, in the order reported by the graph. Papers are
    # identified by their title, every other node type by its name.
    node_counts = []
    for node_type in graph_node_type:
        column = "title" if node_type == "paper" else "name"
        node_counts.append(len(graph.node_feature[node_type][column].tolist()))

    # Running sum of the counts: each type starts where the previous one ends.
    # The last count is not needed because nothing starts after the last type.
    type_begin_ids = [0]
    for count in node_counts[:-1]:
        type_begin_ids.append(type_begin_ids[-1] + count)

    return type_begin_ids, graph_node_type
def _read_node_table(path):
    """Read ``node.dat`` and return parallel lists of node ids and node names."""
    node_ids, node_names = [], []
    with open(path, 'r') as handle:
        for row in handle:
            # Rows are "<id>\t<name>\t<type>"; the type column is not used.
            node_id, node_name, _node_type = row.split('\t')
            node_ids.append(int(node_id))
            node_names.append(node_name)
    return node_ids, node_names


def _read_adjacency(path, num_nodes):
    """Read ``link.dat`` into per-source lists of ``(dst, edge_type)`` pairs."""
    adjacency = [[] for _ in range(num_nodes)]
    with open(path, 'r') as handle:
        for row in handle:
            src, dst, edge_type = row.split('\t')
            adjacency[int(src)].append((int(dst), int(edge_type)))
    return adjacency


def _sample_walk(adjacency, start, alpha, path_length, rand):
    """Sample one walk from ``start`` as a list of ``(node_id, edge_type)`` steps."""
    # The start node was not reached through any edge, hence edge type -1.
    walk = [(start, -1)]
    for _ in range(path_length):
        neighbours = adjacency[walk[-1][0]]
        # Stop at a dead end, or terminate with probability alpha. The random
        # draw only happens when the current node has outgoing edges.
        if not neighbours or rand.random() < alpha:
            break
        walk.append(rand.choice(neighbours))
    return walk


def _render_walk(walk, node_names, relation_names):
    """Render a walk as ``"name rel name rel name ..."``."""
    parts = [node_names[walk[0][0]]]
    for node_id, edge_type in walk[1:]:
        parts.append(str(relation_names[edge_type]))
        parts.append(node_names[node_id])
    return " ".join(parts)


def random_walk_based_corpus_construction(data_dir, relations, alpha=0.05, path_length=1000000, path_num=450000, seed=None):
    r"""
    Sample random walks over the graph stored in node.dat / link.dat and save
    them as a text corpus (output.txt) plus a train/validation split
    (rw_train_corpus.txt and rw_val_corpus.txt).

    Every walk that visits at least two nodes becomes one corpus line of the
    form ``name relation name relation name ...``. Walks that stop on their
    start node are dropped.

    Parameters
    ----------
    data_dir: str
        Data directory for loading link.dat and node.dat, also for saving
        output.txt, rw_train_corpus.txt and rw_val_corpus.txt. File names are
        appended to this string as-is, so it must end with a path separator.
    relations: list
        Relations for all edge types; ``relations[t]`` is the text used for
        edge type ``t`` in link.dat.
    alpha: float, optional
        Each path will terminate sampling with a probability of alpha at every
        step. (default: :obj:`0.05`)
    path_length: int, optional
        Maximum number of steps of each path. (default: :obj:`1000000`)
    path_num: int, optional
        Number of sampled paths. (default: :obj:`450000`)
    seed: int, optional
        Seed for the random walks. ``None`` keeps the walks non-deterministic.
        (default: :obj:`None`)

    Raises
    ------
    ValueError
        If node.dat contains no nodes, or if no sampled walk reached a second
        node (the corpus would be empty).

    Notes
    -----
    Node ids are used as positions: the name of node ``i`` is taken from the
    ``i``-th row of node.dat and its edges are stored in the ``i``-th adjacency
    slot, so node.dat is expected to list ids ``0..N-1`` in order.
    """
    node_ids, node_names = _read_node_table(data_dir + 'node.dat')
    if not node_ids:
        raise ValueError(f"{data_dir}node.dat contains no nodes; nothing to sample from.")
    adjacency = _read_adjacency(data_dir + 'link.dat', len(node_ids))
    relation_names = dict(enumerate(relations))

    rand = random.Random(seed)
    corpus = []
    start_time = time.time()
    for line in range(path_num):
        if line % 10000 == 0:
            dual_time = time.time() - start_time
            print(f"having sampling {line} lines and spent {dual_time}...")
        walk = _sample_walk(adjacency, rand.choice(node_ids), alpha, path_length, rand)
        # A walk that never left its start node carries no relation; skip it.
        if len(walk) >= 2:
            corpus.append(_render_walk(walk, node_names, relation_names))

    with open(data_dir + 'output.txt', 'w') as file:
        file.writelines(sentence + "\n" for sentence in corpus)

    if not corpus:
        raise ValueError(
            "No random walk reached a second node, so the corpus is empty; "
            "check link.dat, alpha, path_length and path_num."
        )

    print(f"length of corpus: {len(corpus)}")
    print("corpus[0]: ")
    print(corpus[0])

    # The split itself is deterministic; only the walks depend on `seed`.
    train_text, val_text = train_test_split(corpus, test_size=0.15, random_state=42)

    with open(data_dir + 'rw_train_corpus.txt', 'w') as file:
        file.writelines(paragraph + "\n" for paragraph in train_text)

    with open(data_dir + 'rw_val_corpus.txt', 'w') as file:
        file.writelines(paragraph + "\n" for paragraph in val_text)

    print(f"length of train_corpus: {len(train_text)}")
    print(f"length of valid_corpus: {len(val_text)}")
    print("train_corpus and valid_corpus have been saved.")