# [Hosting-page residue, not part of the script] "Code pull finished; the page will refresh automatically."
'''
Author: kun 56216004@qq.com
Date: 2023-06-26 11:56:05
LastEditors: kun 56216004@qq.com
LastEditTime: 2023-06-27 17:13:42
FilePath: \langchain\docAsk.py
Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
'''
#
# (langchain39)
# pip install langchain
# Collecting langchain
# Downloading langchain-0.0.215-py3-none-any.whl (1.1 MB)
# pip install openai
# Collecting openai
# Downloading openai-0.27.8-py3-none-any.whl
# pip install jieba
# Collecting jieba
# Downloading jieba-0.42.1.tar.gz (19.2 MB)
# pip install unstructured
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ChatVectorDBChain
from langchain.document_loaders import DirectoryLoader
import jieba as jb
import openai
from pathlib import Path
# Make sure the output directory for the word-segmented documents exists.
# mkdir(exist_ok=True) avoids the separate is_dir() check and the race
# between checking and creating.
my_file = Path("./data/cut/")
my_file.mkdir(parents=True, exist_ok=True)

# All OpenAI requests made through the `openai` module go to this endpoint.
openai.api_base = "https://api.chatanywhere.com.cn/v1"

# SECURITY: the API key must not be committed to source control. It is read
# from the OPENAI_API_KEY environment variable instead of being hard-coded.
api_key = os.environ.get("OPENAI_API_KEY", "")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this script."
    )
openai.api_key = api_key
# Source documents (located under ./data) to be word-segmented.
files = ['研发简要流程.txt', '产品经理.txt']
import time
start_time = time.time()  # start of the timed run; compared against the end time later
for file in files:
    # Read the raw Chinese document from the data folder.
    my_file = f"./data/{file}"
    with open(my_file, "r", encoding='utf-8') as f:
        data = f.read()
    # Segment the text with jieba and join the tokens with single spaces.
    cut_data = " ".join(jb.cut(data))
    # Save the segmented text into the data/cut sub-folder.
    cut_file = f"./data/cut/cut_{file}"
    # utf-8 is given explicitly: with the platform default ('gbk') the write
    # failed on characters such as '\uf06c' ("illegal multibyte sequence").
    # The `with` block closes the file, so no explicit close() is needed.
    with open(cut_file, 'w', encoding='utf-8') as f:
        f.write(cut_data)
# Load every segmented .txt document found under ./data/cut.
# NOTE: per the original author's notes, loading failed until the
# `unstructured` package and the NLTK `punkt` resource were installed.
loader = DirectoryLoader('./data/cut', glob='**/*.txt')
docs = loader.load()

# Split the documents into chunks (chunk_size=1000, no overlap).
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
doc_texts = text_splitter.split_documents(docs)

# Set up OpenAI embeddings with the module-level `api_key`.
# The previous code overwrote OPENAI_API_KEY with an empty string and passed
# that empty key on, which left the embeddings client without credentials.
# The key is also exported to the environment because OpenAI(...) below is
# constructed without an explicit key (presumably it reads the env variable).
os.environ["OPENAI_API_KEY"] = api_key
embeddings = OpenAIEmbeddings(openai_api_key=api_key)

# Embed the chunks into a Chroma vector store and persist it to disk.
# NOTE: requires the `chromadb` package (the original author reported that
# installing it failed in their environment).
vectordb = Chroma.from_documents(doc_texts, embeddings, persist_directory="./data/cut")
vectordb.persist()

# Build the conversational question-answering chain on top of the vector store.
chain = ChatVectorDBChain.from_llm(
    OpenAI(temperature=0, model_name="gpt-3.5-turbo"),
    vectordb,
    return_source_documents=True,
)
def get_answer(question):
    """Ask the module-level `chain` a single question and return its answer.

    Each call starts with an empty chat history, so no context is carried
    over between calls.

    Args:
        question: The question text passed to the chain.

    Returns:
        The value stored under the "answer" key of the chain's result.
    """
    inputs = {"question": question, "chat_history": []}
    return chain(inputs)["answer"]
# Run one sample query against the indexed documents and show the answer.
question = "产品经理职位的核心职责是什么?"
answer = get_answer(question)
print(answer)

# Report the total wall-clock run time in seconds (measured from start_time).
end_time = time.time()
run_time = end_time - start_time
print(run_time)
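# [Hosting-page residue, not part of the script] "This section may contain content unsuitable for display and is hidden. You can review and modify it using the editing features."
# [Hosting-page residue, not part of the script] "If you confirm the content contains no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or anything violating national laws and regulations, you may submit an appeal and it will be handled as soon as possible."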