Vearch
Vearch is the vector search infrastructure for deeping learning and AI applications.
Setting up
Follow instructions.
You'll need to install langchain-community
with pip install -qU langchain-community
to use this integration
%pip install --upgrade --quiet vearch
# OR
%pip install --upgrade --quiet vearch_cluster
Example
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.vearch import Vearch
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoModel, AutoTokenizer
# repalce to your local model path
model_path = "/data/zhx/zhx/langchain-ChatGLM_new/chatglm2-6b"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda(0)
Loading checkpoint shards: 100%|██████████| 7/7 [00:07<00:00, 1.01s/it]
query = "你好!"
response, history = model.chat(tokenizer, query, history=[])
print(f"Human: {query}\nChatGLM:{response}\n")
query = "你知道凌波微步吗,你知道都有谁学会了吗?"
response, history = model.chat(tokenizer, query, history=history)
print(f"Human: {query}\nChatGLM:{response}\n")
Human: 你好!
ChatGLM:你 好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。
Human: 你知道凌波微步吗,你知道都有谁学会了吗?
ChatGLM:凌波微步是一种步伐,最早出自《倚天屠龙记》。在电视剧《人民的名义》中,侯亮平也学会了凌波微步。
# Add your local knowledge files
file_path = "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt" # Your local file path"
loader = TextLoader(file_path, encoding="utf-8")
documents = loader.load()
# split text into sentences and embedding the sentences
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)
# replace to your model path
embedding_path = "/data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese"
embeddings = HuggingFaceEmbeddings(model_name=embedding_path)
No sentence-transformers model found with name /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese. Creating a new one with MEAN pooling.
# first add your document into vearch vectorstore
vearch_standalone = Vearch.from_documents(
texts,
embeddings,
path_or_url="/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/localdb_new_test",
table_name="localdb_new_test",
flag=0,
)
print("***************after is cluster res*****************")
vearch_cluster = Vearch.from_documents(
texts,
embeddings,
path_or_url="http://test-vearch-langchain-router.vectorbase.svc.ht1.n.jd.local",
db_name="vearch_cluster_langchian",
table_name="tobenumone",
flag=1,
)
docids ['18ce6747dca04a2c833e60e8dfd83c04', 'aafacb0e46574b378a9f433877ab06a8', '9776bccfdd8643a8b219ccee0596f370']
***************after is cluster res*****************
docids ['1841638988191686991', '-4519586577642625749', '5028230008472292907']
query = "你知道凌波微步吗,你知道都有谁会凌波微步?"
vearch_standalone_res = vearch_standalone.similarity_search(query, 3)
for idx, tmp in enumerate(vearch_standalone_res):
print(f"{'#'*20}第{idx+1}段相关文档{'#'*20}\n\n{tmp.page_content}\n")
# combine your local knowleadge and query
context = "".join([tmp.page_content for tmp in vearch_standalone_res])
new_query = f"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\n {context} \n 回答用户这个问题:{query}\n\n"
response, history = model.chat(tokenizer, new_query, history=[])
print(f"********ChatGLM:{response}\n")
print("***************************after is cluster res******************************")
query_c = "