加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
utils_ml.py 6.72 KB
一键复制 编辑 原始数据 按行查看 历史
ArvinLeigh 提交于 2023-02-28 04:10 . 新增cora_ml数据集处理代码
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn.functional as F
import networkx as nx
import sys
import pickle
from models import GCN, GAT, SpGAT, GCN_T
import dgl
def load_ml(dataset_str, labelrate):
    """Load a dataset stored as ``ind.<dataset_str>.*`` files under ``data/coraml/``.

    Reads the pickled objects ``x, y, tx, ty, allx, ally, graph`` plus the
    ``test.index`` file, then assembles features, labels, masks, the
    normalized adjacency matrix and a DGL graph.

    Args:
        dataset_str: Dataset name used to build the file names
            (the module itself calls this with ``'cora_ml'``).
        labelrate: Currently unused; kept so existing callers keep working.

    Returns:
        A tuple ``(adj, features, labels, train_mask, val_mask, test_mask, g)``:
        ``adj`` is the normalized adjacency (with self-loops) as a torch sparse
        tensor, ``features`` a dense torch tensor, ``labels`` the per-node
        class index (argmax over the stacked label rows), the three masks are
        boolean torch tensors with one entry per node, and ``g`` is the DGL
        graph built from the same adjacency lists.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point this at dataset files you trust.
        with open("data/coraml/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            objects.append(pickle.load(f, encoding='latin1'))
    _x, y, tx, ty, allx, ally, graph = objects

    # Build the networkx graph once and derive both the adjacency matrix and
    # the DGL graph from it.
    nx_graph = nx.from_dict_of_lists(graph)
    adj = nx.adjacency_matrix(nx_graph)
    adj = preprocess_adj(adj)
    adj = sparse_mx_to_torch_sparse_tensor(adj)
    g = dgl.from_networkx(nx_graph)

    # Test node ids as listed in the index file, and the same ids sorted.
    test_idx_reorder = parse_index_file(
        "data/coraml/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    # vstack appends the test rows below the rest, in index-file order;
    # move each test row to the position given by its node id.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    features = torch.from_numpy(features.toarray())

    # Same stacking and reordering for the label rows.
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    num_nodes = labels.shape[0]

    # Split: the first len(y) nodes are training nodes, the next 500 are
    # validation nodes, and the ids from the index file are test nodes.
    num_val = 500
    idx_train = np.arange(len(y))
    idx_val = np.arange(len(y), len(y) + num_val)
    idx_test = test_idx_range

    train_mask = torch.tensor(sample_mask(idx_train, num_nodes))
    val_mask = torch.tensor(sample_mask(idx_val, num_nodes))
    # Fixed: the test mask was previously built from val_mask by mistake.
    test_mask = torch.tensor(sample_mask(idx_test, num_nodes))

    # Collapse the label rows to a single class index per node.
    labels = torch.argmax(torch.tensor(labels), dim=1)

    return adj, features, labels, train_mask, val_mask, test_mask, g
def parse_index_file(filename):
    """Read a text file containing one integer per line.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of raising ``ValueError``.

    Args:
        filename: Path of the index file.

    Returns:
        A list of ints in file order.
    """
    # The context manager guarantees the handle is closed, even on a parse error.
    with open(filename) as f:
        stripped_lines = (line.strip() for line in f)
        return [int(text) for text in stripped_lines if text]
# def sample_mask(idx, l, labelrate):
# """Create mask."""
# mask = np.zeros(l)
# a = int(len(idx) + labelrate*7 )
# mask[0:a] = 1
# return np.array(mask, dtype=np.bool)
def sample_mask(idx, l):
    """Create a boolean mask of length ``l`` that is True at positions ``idx``.

    Args:
        idx: Integer index or indices to mark (anything NumPy accepts as an
            index for a 1-D array).
        l: Length of the mask.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``bool``.
    """
    # The builtin bool replaces the np.bool alias, which NumPy 1.24 removed.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask
def preprocess_adj(adj, with_ego=True):
    """Normalize an adjacency matrix, optionally adding self-loops first.

    When ``with_ego`` is true an identity matrix of matching size is added to
    ``adj`` before it is passed to ``normalize_adj``; otherwise ``adj`` is
    normalized as given. The normalized matrix is returned.
    """
    to_normalize = adj + sp.eye(adj.shape[0]) if with_ego else adj
    return normalize_adj(to_normalize)
def normalize_adj(adj):
    """Scale an adjacency matrix by D^-0.5 on both sides.

    D is the diagonal matrix of row sums of ``adj``. Rows whose sum is zero
    get a scaling factor of 0 instead of infinity. The result is returned in
    COO format.
    """
    coo_adj = sp.coo_matrix(adj)
    degrees = np.array(coo_adj.sum(1))  # row sums, i.e. the diagonal of D
    inv_sqrt_deg = np.power(degrees, -0.5).flatten()
    # Zero row sums produce inf; zero them so those rows/columns vanish.
    inv_sqrt_deg[np.isinf(inv_sqrt_deg)] = 0.
    scaling = sp.diags(inv_sqrt_deg)  # D^-0.5
    # (A D^-0.5)^T D^-0.5
    column_scaled = coo_adj.dot(scaling)
    both_scaled = column_scaled.transpose().dot(scaling)
    return both_scaled.tocoo()
def sparse_to_tuple(sparse_mx):
    """Turn a sparse matrix (or a list of them) into ``(coords, values, shape)``.

    ``coords`` is an array with one ``[row, col]`` pair per stored entry,
    ``values`` holds the matching data and ``shape`` is the matrix shape.
    A list argument is converted element by element in place, and the same
    list object is returned.
    """
    def _as_triplet(matrix):
        # Work on the COO form so row/col/data are directly available.
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        coords = np.vstack((coo.row, coo.col)).transpose()
        return coords, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = _as_triplet(matrix)
    return sparse_mx
def preprocess_features(features, noTuple=False):
    """Row-normalize a sparse feature matrix so each non-empty row sums to 1.

    Args:
        features: Sparse feature matrix (one row per sample).
        noTuple: If true, return the normalized sparse matrix itself;
            otherwise return the result of ``sparse_to_tuple`` on it.

    Returns:
        The normalized matrix, or its ``sparse_to_tuple`` representation.
        Rows that sum to zero are left as all zeros.
    """
    rowsum = np.asarray(features.sum(1))
    # NumPy refuses to raise integers to a negative power, so integer-valued
    # feature matrices must be promoted to float first.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # Zero rows yield inf here, which is replaced just below; silence the
    # divide-by-zero warning for that expected case.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    if noTuple:
        return features
    return sparse_to_tuple(features)
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor.

    Args:
        sparse_mx: Any scipy sparse matrix.

    Returns:
        A torch sparse COO tensor with the same shape and stored entries,
        with values cast to float32.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    # Indices are a 2 x nnz int64 array: first row holds row ids, second column ids.
    indices = torch.from_numpy(
        np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    shape = torch.Size(coo.shape)
    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor
    # constructor and matches how load_ml builds its sparse tensor.
    return torch.sparse_coo_tensor(indices, values, shape)
# Load the 'cora_ml' dataset as soon as the module is imported or run, and
# expose the results as module-level names.
adj, features, labels, train_mask, val_mask, test_mask, g = load_ml(
    dataset_str='cora_ml',
    labelrate=20,
)
# Show the per-node class indices returned by load_ml.
print("y_trian is", labels)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化