import os

from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

os.environ["OPENAI_API_KEY"] = "KEY"
# Load the source document
loader = UnstructuredFileLoader('path_to_file')
docs = loader.load()

# Split it into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

# Embed the chunks and index them in Chroma
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

# Retrieve the 6 most similar chunks
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
retrieved_docs = retriever.get_relevant_documents("What is X?")
This returns:
[Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}),
Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}),
Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}),
Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}),
Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}),
Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932})]
All six results appear to be the same chunk: identical page_content previews, the same source, and the same start_index.
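To confirm they really are duplicates rather than distinct chunks that merely look alike, a quick check (a sketch, assuming the source and start_index metadata shown above are populated) is to count the distinct (source, start_index) pairs among the hits:

# Rough duplicate check on the retrieved hits; `retrieved_docs` is the list from above
unique_keys = {
    (doc.metadata.get("source"), doc.metadata.get("start_index"))
    for doc in retrieved_docs
}
print(f"{len(retrieved_docs)} hits, {len(unique_keys)} unique chunks")  # here: "6 hits, 1 unique chunks"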
When I first ran this code in Google Colab/Jupyter Notebook, it returned different documents. The more often I reran it, the more it returned the same document, which makes me suspect a database issue: the same chunks are being inserted into the vector store on every run.
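One way to test that suspicion (a sketch, assuming the underlying chromadb collection is reachable through the Chroma wrapper's private _collection attribute) is to print the number of stored embeddings each time the cell runs:

# If this count grows by len(all_splits) on every rerun, the same chunks are being
# re-inserted, and near-identical copies can fill all of the top-k slots.
print("chunks in store:", vectorstore._collection.count())
print("chunks just split:", len(all_splits))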
How do I get 6 unique documents back from the retriever?