kbsonlong
2024-08-30 约 1100 字 预计阅读 3 分钟
安装依赖
pip install -r requirements.txt
requirements.txt
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
async-timeout==4.0.3
attrs==24.2.0
backoff==2.2.1
bcrypt==4.2.0
build==1.2.1
cachetools==5.5.0
certifi==2024.8.30
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.5.3
click==8.1.7
coloredlogs==15.0.1
dataclasses-json==0.6.7
Deprecated==1.2.14
docarray==0.40.0
exceptiongroup==1.2.2
fastapi==0.112.2
filelock==3.15.4
flatbuffers==24.3.25
frozenlist==1.4.1
fsspec==2024.6.1
google-auth==2.34.0
googleapis-common-protos==1.65.0
graphlib_backport==1.1.0
grpcio==1.66.1
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.2
huggingface-hub==0.24.6
humanfriendly==10.0
idna==3.8
importlib_metadata==8.4.0
importlib_resources==6.4.4
jsonpatch==1.33
jsonpointer==3.0.0
kubernetes==30.1.0
langchain==0.2.15
langchain-chroma==0.1.3
langchain-community==0.2.14
langchain-core==0.2.36
langchain-text-splitters==0.2.2
langsmith==0.1.107
markdown-it-py==3.0.0
marshmallow==3.22.0
mdurl==0.1.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.24.4
oauthlib==3.2.2
onnxruntime==1.19.0
opentelemetry-api==1.27.0
opentelemetry-exporter-otlp-proto-common==1.27.0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation==0.48b0
opentelemetry-instrumentation-asgi==0.48b0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-proto==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-semantic-conventions==0.48b0
opentelemetry-util-http==0.48b0
orjson==3.10.7
overrides==7.7.0
packaging==24.1
posthog==3.6.0
protobuf==4.25.4
pyasn1==0.6.0
pyasn1_modules==0.4.0
pydantic==2.8.2
pydantic_core==2.20.1
Pygments==2.18.0
PyPika==0.48.9
pyproject_hooks==1.1.0
PySocks==1.7.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
PyYAML==6.0.2
regex==2024.7.24
requests==2.32.3
requests-oauthlib==2.0.0
rich==13.8.0
rsa==4.9
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
SQLAlchemy==2.0.32
starlette==0.38.2
sympy==1.13.2
tenacity==8.5.0
tiktoken==0.7.0
tokenizers==0.20.0
tomli==2.0.1
tqdm==4.66.5
typer==0.12.5
types-requests==2.32.0.20240712
typing-inspect==0.9.0
typing_extensions==4.12.2
urllib3==2.2.2
uvicorn==0.30.6
uvloop==0.20.0
watchfiles==0.24.0
websocket-client==1.8.0
websockets==13.0.1
wrapt==1.16.0
yarl==1.9.4
zipp==3.20.1
加载文档生成知识库向量数据库
import os

from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
# NOTE(review): the original also imported Chroma from
# langchain_community.vectorstores, which was immediately shadowed by the
# import below; only the maintained langchain_chroma package is kept.
from langchain_chroma import Chroma
from langchain.text_splitter import CharacterTextSplitter

# Embedding function backed by a locally served Ollama model.
embeddings = OllamaEmbeddings(model='nomic-embed-text')

# Directory where the Chroma database is persisted between runs.
persist_directory = './chroma_db'

# Open the existing persisted Chroma database; deliberately fall back to a
# fresh (non-persisted) store when it cannot be loaded, so a first run or a
# corrupted directory does not abort the ingestion.
try:
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
except Exception as e:
    print(f"未能加载现有数据库,将创建一个新的数据库: {e}")
    vectorstore = Chroma(embedding_function=embeddings)


def process_and_persist_book(file_path, vectorstore):
    """Load one text file, split it into chunks, and add them to the store.

    Args:
        file_path: path of a plain-text ``.txt`` file to index.
        vectorstore: the Chroma instance that receives the chunks;
            ``add_documents`` embeds and stores them incrementally.
    """
    documents = TextLoader(file_path).load()
    # Token-based splitting: 7500-token chunks with a 100-token overlap.
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=7500, chunk_overlap=100)
    doc_splits = text_splitter.split_documents(documents)
    # langchain_chroma persists on add, so no explicit persist() is needed.
    vectorstore.add_documents(doc_splits)


# Recursively index every .txt file under the books directory.
directory_path = './books/'
for root, dirs, files in os.walk(directory_path):
    for file in files:
        if file.endswith('.txt'):
            file_path = os.path.join(root, file)
            print(f"处理文件: {file_path}")
            process_and_persist_book(file_path, vectorstore)
加载向量数据
# NOTE(review): the original began with `from langchain_community import
# embeddings`, a module import immediately shadowed by the `embeddings`
# variable below; the dead import has been removed.
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_chroma import Chroma

# Base URL of the local Ollama server (embeddings and chat model).
ollama_host = "http://localhost:11434"

# Embedding function — must match the model used when the DB was built.
embeddings = OllamaEmbeddings(base_url=ollama_host, model='nomic-embed-text')

# Re-open the previously persisted Chroma database and expose it as a
# retriever for the RAG chain.
vectorstore = Chroma(
    persist_directory='./chroma_db',
    embedding_function=embeddings,
)
retriever = vectorstore.as_retriever()

# Chat model that answers questions over the retrieved context.
model_local = ChatOllama(base_url=ollama_host, model="qwen:7b")

template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# RAG pipeline: retrieve context, fill the prompt, query the model,
# and parse the reply to a plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model_local
    | StrOutputParser()
)

print(chain.invoke("贾宝玉是什么样的角色?"))
贾宝玉是《红楼梦》中的主要人物之一,被作者赋予了特殊的地位——“古今第一情种”。贾宝玉性格复杂,既有封建社会公子哥儿的娇纵和任性,又具有超越世俗的叛逆精神和对女性的深深同情。在情感关系上,贾宝玉与林黛玉、薛宝钗等女子有着复杂的纠葛和深厚的感情纽带