from transformers import AutoModelForCausalLM, AutoTokenizer
from cpm9g_tokenizer import CPM9GTokenizer
from typing import List
import torch
from auto_gptq import AutoGPTQForCausalLM
import os
import time
import numpy as np
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
from dprint import dprint
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
cur_dir = os.path.split(os.path.abspath(__file__))[0]
device = "cuda"
# Shared retrieval resources: the instruction table (database.xlsx, resolved
# relative to the working directory) and the sentence-embedding model used by
# QueryDatabase below.
df = pd.read_excel('database.xlsx', header=0)
queryModel = SentenceTransformer(f"{cur_dir}/paraphrase-multilingual-MiniLM-L12-v2")

class Model:
    def __init__(self, name: str):
        self.name = name
        self.model = None
        self.tokenizer = None
        self.except_count = 0

    def load_model(self) -> None:
        pass

    def unload_model(self) -> None:
        if self.model is not None:
            print("unload model {}".format(self.name))
            del self.model
            self.model = None

    def judge(self, question: str) -> str:
        return "none"

    def inference(self, data: List[str], prompt: str) -> str:
        return "hello"

    def wps_inference(self, question: str) -> str:
        return "hello, wps"

class CPM9GGPTQQuantModel(Model):
    def __init__(self, name: str):
        super().__init__(name)

    def process_exception(self) -> str:
        self.except_count += 1
        if self.except_count < 3:
            output = "none: an exception occurred"
        else:
            output = "none: still failing, consider restarting the service"
        return output

    def load_model(self) -> None:
        self.unload_model()
        print(f"loading {self.name} model...")
        start_time = time.time()
        self.model = AutoGPTQForCausalLM.from_quantized(
            f"{cur_dir}/hf-8b-v2-gptq/4bits-128g-desc_act_True-damp_0.01", device="cuda:0")
        self.model.eval()
        self.tokenizer = CPM9GTokenizer(f"{cur_dir}/hf-8b-v2-gptq/4bits-128g-desc_act_True-damp_0.01/vocabs.txt")
        end_time = time.time()
        elapsed_time = np.round(end_time - start_time, decimals=3)
        print(f"model load time: {elapsed_time} s")

    def judge(self, question: str) -> str:
        if self.model is None:
            self.load_model()
        start_time = time.time()
        try:
            input_ids = torch.tensor([[self.tokenizer.bos_id] + self.tokenizer.encode(question)[:4096]]).cuda()
            output = self.model.generate(inputs=input_ids, max_new_tokens=2048, do_sample=False, num_beams=1)[0].tolist()
        except Exception:
            # Bail out with the canned error message; there are no token ids
            # to decode on this path.
            return self.process_exception()
        # Keep the reply after the last <AI> marker, drop the 4-character
        # end-of-sequence suffix, and strip everything except CJK characters,
        # letters, digits, spaces and semicolons.
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9 ;]', '', self.tokenizer.decode(output).split('<AI>')[-1][:-4])
        end_time = time.time()
        elapsed_time = np.round(end_time - start_time, decimals=3)
        print(f"judge inference time: {elapsed_time} s")
        return text

    def inference(self, data: List[str], prompt: str) -> str:
        if self.model is None:
            self.load_model()
        start_time = time.time()
        # Join the conversation turns and append <AI> so the model answers.
        joined = "".join(data)
        text_in = prompt + joined + "<AI>"
        try:
            input_ids = torch.tensor([[self.tokenizer.bos_id] + self.tokenizer.encode(text_in)[:4096]]).cuda()
            output = self.model.generate(inputs=input_ids, max_new_tokens=2048, do_sample=False, num_beams=1)[0].tolist()
        except Exception:
            return self.process_exception()
        text = self.tokenizer.decode(output)
        end_time = time.time()
        elapsed_time = np.round(end_time - start_time, decimals=3)
        output = text.split('<AI>')[-1][:-4]
        print(f"ask inference time: {elapsed_time} s")
        print(output)
        return output

    def wps_inference(self, question: str) -> str:
        question = question.replace('\r', '\n')
        text_in = f"<用户>{question}<AI>"
        if self.model is None:
            self.load_model()
        start_time = time.time()
        try:
            input_ids = torch.tensor([[self.tokenizer.bos_id] + self.tokenizer.encode(text_in)]).cuda()
            output = self.model.generate(inputs=input_ids, max_new_tokens=5130, do_sample=False, num_beams=1)[0].tolist()
        except Exception:
            return self.process_exception()
        end_time = time.time()
        text = self.tokenizer.decode(output)
        print("text" + "+" * 87)
        print(text)
        print("+" * 87)
        elapsed_time = np.round(end_time - start_time, decimals=3)
        output = text.split('<AI>')[-1][:-4]
        dprint(f"wps inference time: {elapsed_time} s")
        print("output" + "+" * 87)
        print(output)
        print("+" * 87)
        return output
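
# Hedged usage sketch for the GPTQ-quantized model above (assumes the 4-bit
# checkpoint directory exists under cur_dir; the query is illustrative only):
#
#   cpm = CPM9GGPTQQuantModel("cpm9g-8b-gptq")
#   answer = cpm.wps_inference("帮我总结一下这周的工作")  # lazy-loads on first call
#   cpm.unload_model()  # release GPU memory when done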

class QueryDatabase:
    def __init__(self, model):
        self.model = model

    def query(self, job):
        # 1. Read the data backing the vector "database".
        texts = df.iloc[:, 0].values
        # 2. Embed the texts.
        text_vectors = queryModel.encode(texts, convert_to_numpy=True)
        d = text_vectors.shape[1]  # embedding dimensionality
        # 3. Build a flat index using L2 distance.
        index = faiss.IndexFlatL2(d)
        # 4. Add the vectors to the index.
        index.add(text_vectors)
        # 5. Search for the nearest database entry to the query.
        query_vector = queryModel.encode([job], convert_to_numpy=True)
        distances, indices = index.search(query_vector, 1)
        # Distance threshold above which the match is rejected.
        threshold = 2
        temp = 0
        # Log the distance, the threshold, and the query.
        print(distances[0][0], threshold, job)
        if distances[0][0] > threshold:
            print("no match for this vector in the database!")
            if "查找" in job:  # the query asks to "find" something
                temp = 1
            else:
                return "none"
        text = texts[indices[0][0]]
        database_text = df[df['Key'] == text].iloc[:, :].values.tolist()[0]
        directive = database_text[1]
        param = database_text[2]
        if param == '不确定参数':  # "undetermined parameter"
            return "next"
        if param == '特殊例子':  # "special case"
            return directive
        if temp == 1 or directive.startswith('find'):
            # Chinese prompt (the model is Chinese): extract the time span
            # mentioned in the question and normalise it to days, answering
            # in the form '时间参数:###天' ("time parameter: ### days").
            judge_directive_template = '''提取"{question}"这句话中的时间参数,如“1天、3天、一周、一个月、半个月、一天、一年、2年、三周”等这一类的时间参数。结果转换成以天为单位的时间,最终返回的格式为'时间参数:###天'。回答:<AI>'''
            directive_input = judge_directive_template.format(question=job)
            text = self.model.judge(directive_input)
            output = re.sub(r'[^0-9]', '', text.split('<AI>')[-1].split(':')[-1])
            print(output, text)
            if output == '':
                output = '3'  # default to 3 days when no time span is found
            directive += ' -{}'.format(output)
        return directive
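
# Note: query() re-encodes every database row and rebuilds the FAISS index on
# each call. A minimal caching sketch (assuming `df` does not change during
# the process lifetime; CachedQueryDatabase is hypothetical):
#
#   class CachedQueryDatabase(QueryDatabase):
#       def __init__(self, model):
#           super().__init__(model)
#           self.texts = df.iloc[:, 0].values
#           vectors = queryModel.encode(self.texts, convert_to_numpy=True)
#           self.index = faiss.IndexFlatL2(vectors.shape[1])
#           self.index.add(vectors)  # built once, reused by every query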

class MiniCPM3_4BModel(Model):
    def __init__(self, name: str):
        super().__init__(name)

    def process_exception(self) -> str:
        self.except_count += 1
        if self.except_count < 3:
            output = "an exception occurred"
        else:
            output = "still failing, consider restarting the service"
        return output

    def load_model(self) -> None:
        self.unload_model()
        print(f"loading {self.name} model...")
        start_time = time.time()
        self.model = AutoModelForCausalLM.from_pretrained(cur_dir + "/MiniCPM3-4B", torch_dtype=torch.bfloat16,
                                                          device_map=device, trust_remote_code=True)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(cur_dir + "/MiniCPM3-4B", trust_remote_code=True)
        end_time = time.time()
        elapsed_time = np.round(end_time - start_time, decimals=3)
        print(f"model load time: {elapsed_time} s")

    def judge(self, question: str) -> str:
        if self.model is None:
            self.load_model()
        start_time = time.time()
        try:
            model_inputs = self.tokenizer.apply_chat_template([{"role": "user", "content": question}],
                                                              return_tensors="pt", add_generation_prompt=True).to(device)
            model_outputs = self.model.generate(model_inputs, max_new_tokens=2048, top_p=0.7, temperature=0.7)
            # Strip the prompt tokens from each returned sequence.
            output_token_ids = [model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs))]
            output = self.tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0]
        except Exception:
            output = self.process_exception()
        end_time = time.time()
        elapsed_time = np.round(end_time - start_time, decimals=3)
        print(f"minicpm inference time: {elapsed_time} s")
        return output

    def inference(self, data: List[str], prompt: str) -> str:
        if self.model is None:
            self.load_model()
        start_time = time.time()
        text = self.judge(prompt + "".join(data) + "<AI>")
        end_time = time.time()
        elapsed_time = np.round(end_time - start_time, decimals=3)
        output = text.split('<AI>')[-1]
        print(f"ask inference time: {elapsed_time} s")
        print(output)
        return output
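
# Hedged usage sketch for MiniCPM3-4B (assumes the MiniCPM3-4B weights sit
# under cur_dir; generate() samples with top_p/temperature, so outputs vary):
#
#   mini = MiniCPM3_4BModel("minicpm3-4b")
#   print(mini.judge("推荐五个适合周末出游的地方"))
#   mini.unload_model()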

# class CPM9GModel(Model):
#     def load_model(self) -> None:
#         model_path = "/data/public/9G/checkpoints-epoch-1"
#         dprint(f"loading {self.name} model...")
#         start_time = time.time()
#         model_config = json.load(open(f"{model_path}/config.json", 'r'))
#         model_config["new_vocab"] = True
#         self.model = CPM9G(
#             "",
#             f"{model_path}/vocabs.txt",
#             0,
#             memory_limit=(30 << 29) + (30 << 28),
#             model_config=model_config,
#             load_model=False,
#         )
#         self.model.load_model_pt(f"{model_path}/cpm9g-8b-sft-epoch-1.pt")
#         model_time = time.time()
#         elapsed_time = model_time - start_time
#         elapsed_time = np.round(elapsed_time, decimals=3)
#         dprint(f"model load time: {elapsed_time} s")
#
#     def inference(self, data: List[str], prompt: str) -> str:
#         if self.model == None:
#             self.load_model()
#         start_time = time.time()
#         input = "".join(data)
#         res = self.model.inference(input + "<AI>", max_length=40960)
#         model_time = time.time()
#         elapsed_time = model_time - start_time
#         elapsed_time = np.round(elapsed_time, decimals=3)
#         output = res['result'].split('<AI>')[-1]
#         # dprint(f"inference time: {elapsed_time} s, input length: {len(data)}, output length: {len(output)}/{len(res['result'])}")
#         return output

# class CPM9GSMQuantModel(Model):
#     def __init__(self, name: str):
#         self.name = name
#         self.model = None
#         model_path = './hf-smooth-quant-05-pertoken'
#         self.tokenizer = CPM9GTokenizer(f"{model_path}/vocabs.txt")
#
#     def load_model(self) -> None:
#         self.unload_model()
#         dprint(f"loading {self.name} model...")
#         start_time = time.time()
#         model_path = './hf-smooth-quant-05-pertoken'
#         config_path = f"{model_path}/quant_config.json"
#         quant_config = parse_quant_config(config_path)
#         self.model = Int8LlamaForCausalLM.from_pretrained(model_path, quant_config, attn_implementation="eager", device_map='cuda')
#         # self.tokenizer = CPM9GTokenizer(f"{model_path}/vocabs.txt")
#         model_time = time.time()
#         elapsed_time = model_time - start_time
#         elapsed_time = np.round(elapsed_time, decimals=3)
#         dprint(f"model load time: {elapsed_time} s")
#
#     def inference(self, data: List[str], prompt: str) -> str:
#         if self.model == None:
#             self.load_model()
#         start_time = time.time()
#         # if len(data) > 1:
#         input = "".join(data)
#         # else:
#         #     input = data[0]
#         #     # find the first occurrence of the user marker
#         #     position = str.find("<用户>")
#         #     if position != -1:
#         #         # compute the insertion point
#         #         insert_position = position + len("<用户>")
#         #         # splice the prompt into the string
#         #         input = str[:insert_position] + prompt + str[insert_position:]
#         #     else:
#         #         input = str
#         input_ids = torch.tensor([[self.tokenizer.bos_id] + self.tokenizer.encode(prompt + input + "<AI>")]).cuda()
#         output = self.model.generate(input_ids, max_length=1024)[0].tolist()
#         # model_time = time.time()
#         # elapsed_time = model_time - start_time
#         # dprint(f"inference time: {elapsed_time} s")
#         text = self.tokenizer.decode(output)
#         model_time = time.time()
#         elapsed_time = model_time - start_time
#         elapsed_time = np.round(elapsed_time, decimals=3)
#         output = text.split('<AI>')[-1][:-4]
#         dprint(f"inference time: {elapsed_time} s")
#         return output

# class CPM9GGPTQQuantModel(Model):
#     def __init__(self, name: str):
#         self.except_count = 0
#         self.name = name
#         self.model = None
#         self.tokenizer = CPM9GTokenizer(f"{cur_dir}/hf-8b-v2-gptq/4bits-128g-desc_act_True-damp_0.01/vocabs.txt")
#
#     def process_exception(self) -> str:
#         self.except_count += 1
#         if self.except_count < 3:
#             output = "an exception occurred"
#         else:
#             output = "still failing, consider restarting the service"
#         return output
#
#     def load_model(self) -> None:
#         self.unload_model()
#         dprint(f"loading {self.name} model...")
#         start_time = time.time()
#         self.model = AutoGPTQForCausalLM.from_quantized(f"{cur_dir}/hf-8b-v2-gptq/4bits-128g-desc_act_True-damp_0.01", device="cuda:0")
#         self.model.eval()
#         model_time = time.time()
#         elapsed_time = model_time - start_time
#         elapsed_time = np.round(elapsed_time, decimals=3)
#         dprint(f"model load time: {elapsed_time} s")
#
#     def judge(self, question: str) -> str:
#         if self.model == None:
#             self.load_model()
#         start_time = time.time()
#         final_input = judge_template.replace("{question}", question)
#         start_time = time.time()
#         try:
#             input_ids = torch.tensor([[self.tokenizer.bos_id] + self.tokenizer.encode(final_input)[:4096]]).cuda()
#             output = self.model.generate(inputs=input_ids, max_new_tokens=2048, do_sample=False, num_beams=1)[0].tolist()
#         except Exception:
#             output = self.process_exception()
#
#         end_time = time.time()
#         # speed = len(output) / (end_time - start_time)
#         text = self.tokenizer.decode(output)
#         # print(f"speed:{speed:.2f}token/s max memory: {torch.cuda.max_memory_allocated(self.model.device)/ 1024**2:.2f}M")
#         elapsed_time = end_time - start_time
#         elapsed_time = np.round(elapsed_time, decimals=3)
#
#         # take the text, strip punctuation via a regex, and extract the proper AI reply
#         print(text.split('<AI>')[-1][:-4])
#         output = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9 ;]', '', text.split('<AI>')[-1][:-4])
#         # split into sub-tasks
#         job_list = output.split(';')
#         print(output, job_list)
#
#         def query(job):
#             # 1. read the data (vector database)
#             df = pd.read_excel('database.xlsx', header=0)
#             texts = df.iloc[:, 0].values
#             # 2. embed the texts
#             model = SentenceTransformer('./paraphrase-multilingual-MiniLM-L12-v2')
#             text_vectors = model.encode(texts, convert_to_numpy=True)
#             d = text_vectors.shape[1]  # embedding dimensionality
#             # 3. build the index
#             index = faiss.IndexFlatL2(d)  # flat index with L2 distance
#             # 4. add the vectors to the index
#             index.add(text_vectors)
#             # 5. run the query
#             query_vector = model.encode([job], convert_to_numpy=True)
#             distances, indices = index.search(query_vector, 1)
#             # distance threshold
#             stresct = 2
#             temp = 0
#             # inspect the distance against the threshold
#             # print(distances[0][0], stresct)
#             if distances[0][0] > stresct:
#                 print("no match for this vector in the database!")
#                 if "查找" in job or "文件" in job:
#                     temp = 2
#                 else:
#                     temp = 1
#             text = texts[indices[0][0]]
#             database_text = df[df['Key'] == text].iloc[:, :].values.tolist()[0]
#             directive = database_text[1]
#             if temp == 1:
#                 return ""
#             elif temp == 2 or directive.startswith('find'):
#                 def find_parame(question, directive):
#                     directive_input = judge_directive_template.replace("{question}", question).replace("{directive}", directive)
#                     try:
#                         input_ids = torch.tensor([[self.tokenizer.bos_id] + self.tokenizer.encode(directive_input)[:4096]]).cuda()
#                         output = self.model.generate(inputs=input_ids, max_new_tokens=2048, do_sample=False, num_beams=1)[0].tolist()
#                     except Exception:
#                         output = self.process_exception()
#                     text = self.tokenizer.decode(output)
#                     output = re.sub(r'[^0-9]', '', text.split('<AI>')[-1][:-4].split(':')[-1])
#                     print(output, text)
#                     if output == '':
#                         return 3
#                     return output
#                 parame = find_parame(job, directive)
#                 return directive + ' -{}'.format(parame)
#             return directive
#
#         complete_directives = []
#         for job in job_list:
#             if job != '':
#                 complete_directive = query(job)
#                 complete_directives.append(complete_directive)
#         print(f"judge inference time: {elapsed_time} s")
#         return '&&&'.join(complete_directives)
#
# dashscope.api_key = "sk-06320ed501d84c8090f95636930dfc25"
# class QwenModel(Model):
#     def inference(self, data: List[str], prompt: str) -> str:
#         input = data[0]
#         messages = [{'role': Role.SYSTEM, 'content': prompt},
#                     {'role': Role.USER, 'content': input}]
#         for text in data:
#             if text.startswith("<AI>"):
#                 messages.append({'role': Role.ASSISTANT, 'content': text[4:]})
#             else:
#                 messages.append({'role': Role.USER, 'content': text[4:]})
#         response = dashscope.Generation.call(
#             dashscope.Generation.Models.qwen_turbo,
#             messages=messages,
#             result_format='message',  # set the result to be "message" format
#         )
#         if response.status_code == HTTPStatus.OK:
#             return response['output']['choices'][0]['message']['content']
#         else:
#             return "network error"
#
#     def load_model(self) -> None:
#         print("")
# class MNModel(Model):
#     def inference(self, data: List[str], prompt: str) -> str:
#         question = data[-1]
#         url = "http://127.0.0.1:5000/mo_qa"
#         headers = {
#             'Content-Type': 'application/json',
#             "accept": "text/event-stream"
#         }
#         cache_id = "test"
#         answer_index = 0
#         use_buffer = False
#         start_time = time.time()
#         while True:
#             params = {
#                 "question": question,
#                 "cache_id": cache_id,
#                 "answer_index": answer_index,
#                 "use_buffer": use_buffer
#             }
#             json_data = json.dumps(params)
#             response = requests.request("POST", url, stream=True, headers=headers, data=json_data)
#             if response.status_code == 200:
#                 answer = json.loads(response.text)
#                 answer_len = answer['answer_len']
#                 answer_index += 1
#                 if answer_len is not None:
#                     if use_buffer:
#                         mo_sent = answer['mo_sent']
#                         zh_sent = answer['zh_sent']
#                         break
#                     if answer_index > answer_len:
#                         use_buffer = True
#         end_time = time.time()
#         elapsed_time = end_time - start_time
#         elapsed_time = np.round(elapsed_time, decimals=3)
#         output = mo_sent + "\n" + zh_sent
#         print(f"ask inference time: {elapsed_time} s")
#         # print(mo_sent + "\n" + zh_sent)
#         return output
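
# A minimal, hedged smoke test tying the live classes together (assumes the
# GPTQ checkpoint and database.xlsx are present; the query is illustrative):
if __name__ == "__main__":
    cpm = CPM9GGPTQQuantModel("cpm9g-8b-gptq")
    qdb = QueryDatabase(cpm)
    # query() returns a directive string, "next", or "none".
    directive = qdb.query("查找最近一周的文档")
    print("matched directive:", directive)
    cpm.unload_model()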