Building a Private Knowledge Base with LangChain and Ollama

Install dependencies

pip install -r requirements.txt

requirements.txt

aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
async-timeout==4.0.3
attrs==24.2.0
backoff==2.2.1
bcrypt==4.2.0
build==1.2.1
cachetools==5.5.0
certifi==2024.8.30
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.5.3
click==8.1.7
coloredlogs==15.0.1
dataclasses-json==0.6.7
Deprecated==1.2.14
docarray==0.40.0
exceptiongroup==1.2.2
fastapi==0.112.2
filelock==3.15.4
flatbuffers==24.3.25
frozenlist==1.4.1
fsspec==2024.6.1
google-auth==2.34.0
googleapis-common-protos==1.65.0
graphlib_backport==1.1.0
grpcio==1.66.1
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.2
huggingface-hub==0.24.6
humanfriendly==10.0
idna==3.8
importlib_metadata==8.4.0
importlib_resources==6.4.4
jsonpatch==1.33
jsonpointer==3.0.0
kubernetes==30.1.0
langchain==0.2.15
langchain-chroma==0.1.3
langchain-community==0.2.14
langchain-core==0.2.36
langchain-text-splitters==0.2.2
langsmith==0.1.107
markdown-it-py==3.0.0
marshmallow==3.22.0
mdurl==0.1.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.24.4
oauthlib==3.2.2
onnxruntime==1.19.0
opentelemetry-api==1.27.0
opentelemetry-exporter-otlp-proto-common==1.27.0
opentelemetry-exporter-otlp-proto-grpc==1.27.0
opentelemetry-instrumentation==0.48b0
opentelemetry-instrumentation-asgi==0.48b0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-proto==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-semantic-conventions==0.48b0
opentelemetry-util-http==0.48b0
orjson==3.10.7
overrides==7.7.0
packaging==24.1
posthog==3.6.0
protobuf==4.25.4
pyasn1==0.6.0
pyasn1_modules==0.4.0
pydantic==2.8.2
pydantic_core==2.20.1
Pygments==2.18.0
PyPika==0.48.9
pyproject_hooks==1.1.0
PySocks==1.7.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
PyYAML==6.0.2
regex==2024.7.24
requests==2.32.3
requests-oauthlib==2.0.0
rich==13.8.0
rsa==4.9
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
SQLAlchemy==2.0.32
starlette==0.38.2
sympy==1.13.2
tenacity==8.5.0
tiktoken==0.7.0
tokenizers==0.20.0
tomli==2.0.1
tqdm==4.66.5
typer==0.12.5
types-requests==2.32.0.20240712
typing-inspect==0.9.0
typing_extensions==4.12.2
urllib3==2.2.2
uvicorn==0.30.6
uvloop==0.20.0
watchfiles==0.24.0
websocket-client==1.8.0
websockets==13.0.1
wrapt==1.16.0
yarl==1.9.4
zipp==3.20.1
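
The two Ollama models used below, nomic-embed-text for embeddings and qwen:7b for chat, must already be available in the local Ollama instance; pull them before running the scripts:

ollama pull nomic-embed-text
ollama pull qwen:7b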

Load the documents and build the knowledge-base vector store

import os
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import CharacterTextSplitter

# Initialize the embedding function
embeddings = OllamaEmbeddings(model='nomic-embed-text')

# Directory where the Chroma database is persisted
persist_directory = './chroma_db'

# Open the Chroma database; if the directory does not exist yet,
# Chroma simply creates a new, empty database there
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Split one file into chunks, embed them, and add them to the store incrementally
def process_and_persist_book(file_path, vectorstore):
    # Load the file and split it into chunks
    documents = TextLoader(file_path).load()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
    doc_splits = text_splitter.split_documents(documents)

    # Embed and store the chunks
    vectorstore.add_documents(doc_splits)

# Process each book
# process_and_persist_book('./水浒传.txt', vectorstore)
# process_and_persist_book('./红楼梦.txt', vectorstore)
# process_and_persist_book('./西游记.txt', vectorstore)
# process_and_persist_book('./隋唐演义.txt', vectorstore)
directory_path = './books/'  # directory containing the .txt files
for root, dirs, files in os.walk(directory_path):
    for file in files:
        if file.endswith('.txt'):
            file_path = os.path.join(root, file)
            print(f"Processing file: {file_path}")
            process_and_persist_book(file_path, vectorstore)

# No explicit vectorstore.persist() call is needed: with chromadb >= 0.4
# (used by langchain-chroma), documents are persisted automatically as they are added
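
As a quick sanity check, the freshly populated store can be queried directly at the end of the script above. A minimal sketch (the query string and the value of k are illustrative examples, not part of the original workflow):

# Sanity check: fetch the chunks most similar to a test query
# (the query string and k=2 are illustrative values)
results = vectorstore.similarity_search("贾宝玉", k=2)
for doc in results:
    # each result carries its source file path in the metadata set by TextLoader
    print(doc.metadata.get("source"), doc.page_content[:100])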

Load the vector store and query the model

from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

ollama_host = "http://localhost:11434"

# Create the embedding function instance
embeddings = OllamaEmbeddings(base_url=ollama_host, model='nomic-embed-text')
# Load the previously persisted vector store
vectorstore = Chroma(persist_directory='./chroma_db', embedding_function=embeddings)
retriever = vectorstore.as_retriever()

# Ask the model a question
model_local = ChatOllama(base_url=ollama_host, model="qwen:7b")
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model_local
    | StrOutputParser()
)
# "What kind of character is Jia Baoyu?"
print(chain.invoke("贾宝玉是什么样的角色?"))
Jia Baoyu is one of the main characters of Dream of the Red Chamber, cast by the author in a special role: "the greatest romantic of all time" (古今第一情种). His personality is complex: he has the pampered willfulness of a young master of feudal society, yet also a rebellious spirit that transcends convention and a deep sympathy for women. Emotionally, he is bound up in complicated entanglements and deep attachments with Lin Daiyu, Xue Baochai, and other women.
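
Two knobs are worth noting, shown here as a sketch with assumed values rather than settings from the original post. By default as_retriever() returns the 4 most similar chunks, and since each chunk above may be up to 7,500 tokens, the stuffed prompt can overflow qwen:7b's context window; the LCEL chain also supports token-by-token streaming via .stream():

# Create the retriever with a smaller k *before* building the chain,
# replacing the plain vectorstore.as_retriever() call above
# (k=2 is an illustrative value to tune, not from the original post)
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

# Stream the answer token by token instead of waiting for the full reply
for token in chain.stream("贾宝玉是什么样的角色?"):
    print(token, end="", flush=True)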