# 代码拉取完成,页面将自动刷新
from scipy.sparse import csr_matrix
import numpy as np
import pickle
root = 'source/'


def build_vocab(text):
    """Build the character vocabulary of *text*.

    Args:
        text: The full document as a single string.

    Returns:
        A ``(chars, char_to_ix, ix_to_char)`` tuple: the distinct characters
        of *text* as a list, a dict mapping each character to its position in
        that list, and the inverse dict.
    """
    # Sort so the index assignment is reproducible between runs; plain
    # ``list(set(text))`` order varies with string hash randomisation.
    chars = sorted(set(text))
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = dict(enumerate(chars))
    return chars, char_to_ix, ix_to_char


def encode_text(text, char_to_ix):
    """One-hot encode *text* and derive next-character targets.

    Args:
        text: The document to encode.
        char_to_ix: Mapping from every character of *text* to its column.

    Returns:
        A ``(X, y)`` tuple. ``X`` is a ``csr_matrix`` of shape
        ``(len(text), len(char_to_ix))`` holding a single 1 per row in the
        column of that row's character. ``y`` is the array of character
        indices rotated left by one, so ``y[i]`` is the index of the character
        following position ``i``; the last entry wraps around to the first
        character of *text*.

    Raises:
        KeyError: If *text* contains a character missing from *char_to_ix*.
    """
    n = len(text)
    # Dict lookup instead of ``chars.index(c)``: O(1) per character.
    # ``dtype=int`` is what the removed ``np.int`` alias stood for.
    char_id = np.array([char_to_ix[c] for c in text], dtype=int)
    # Build the sparse matrix directly from (data, (rows, cols)) rather than
    # assigning into an empty CSR matrix, which scipy warns is inefficient.
    X = csr_matrix(
        (np.ones(n, dtype=int), (np.arange(n), char_id)),
        shape=(n, len(char_to_ix)),
    )
    y = np.roll(char_id, -1)
    return X, y


def save_pickle(obj, path):
    """Pickle *obj* to *path* using protocol 2 (as the original script did)."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=2)


if __name__ == "__main__":
    # Select the novel
    with open(root + '鸳鸯刀.txt', "r", encoding="utf-8") as file:
        data = file.read()

    # Get unique chars and the char <-> index lookup tables
    chars, char_to_ix, ix_to_char = build_vocab(data)

    # Get doc length and charset size
    data_size, vocab_size = len(data), len(chars)
    print(f'data has {data_size} characters, {vocab_size} unique.')

    X_train, y_train = encode_text(data, char_to_ix)
    print(X_train)
    print(type(y_train))

    save_pickle(X_train, root + 'X_train.pickle')
    save_pickle(y_train, root + 'y_train.pickle')
    save_pickle(chars, root + 'chars.pickle')
    save_pickle(vocab_size, root + 'vocab_size.pickle')
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。