I have around 30 GB of JSON data spread across multiple files, and I want to build a query bot on top of it. I have already built the same thing for text files, but I am not sure how it will work for JSON data.
I have explored JSONLoader, but I don't know how to use it to convert the JSON data into vectors and store them in ChromaDB so that I can query them. https://python.langchain.com/docs/modules/data_connection/document_loaders/json
Sample JSON file: http://jsonblob.com/1147948130921996288
Code for Text data:
# Loading and Splitting the Documents
from langchain.document_loaders import DirectoryLoader
# Folder passed to load_docs() below, which hands it to DirectoryLoader.
directory = '/content/drive/MyDrive/Data Science/LLM/docs/text files'
def load_docs(directory):
    """Load the documents found under ``directory``.

    Builds a ``DirectoryLoader`` for the given path and returns the result
    of its ``load()`` call.
    """
    return DirectoryLoader(directory).load()
# Load everything under `directory`. The bare len() expression only echoes the
# document count in an interactive/notebook session; in a plain script its
# value is discarded.
documents = load_docs(directory)
len(documents)
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split ``documents`` into smaller chunks.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to
    ``RecursiveCharacterTextSplitter``; the return value is whatever its
    ``split_documents`` call produces for ``documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
# Chunk the loaded documents using split_docs()' default chunk_size and
# chunk_overlap, then print how many chunks were produced.
docs = split_docs(documents)
print(len(docs))
# Embedding Text Using Langchain
from langchain.embeddings import SentenceTransformerEmbeddings
# Embedding function used both when building the store and when querying it.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
#Creating Vector Store with Chroma DB
from langchain.vectorstores import Chroma
# Location handed to Chroma as its persist_directory.
persist_directory = "/content/drive/MyDrive/Data Science/LLM/docs/chroma_db"
# Build the vector store from the chunks with the embedding function above,
# then ask Chroma to persist it.
vectordb = Chroma.from_documents(
documents=docs, embedding=embeddings, persist_directory=persist_directory
)
vectordb.persist()
#Using OpenAI Large Language Models (LLM) with Chroma DB
import os
# NOTE(review): "sk-your-key" is a placeholder written straight into the
# process environment; a real key has to be supplied before this can run, and
# it should presumably come from outside the source file - TODO confirm.
os.environ["OPENAI_API_KEY"] = "sk-your-key"
from langchain.chat_models import ChatOpenAI
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)
#Extracting Answers from Documents
from langchain.chains.question_answering import load_qa_chain
# Question-answering chain over the chat model, created with verbose=True.
chain = load_qa_chain(llm, chain_type="stuff",verbose=True)
query = "who is Mr. Jabez Wilson?"
# Fetch the stored chunks that match the query, then hand those chunks and the
# question to the chain.
matching_docs = vectordb.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)
# Bare expression: echoes the answer only in an interactive/notebook session.
answer
What I tried for JSON data:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import JSONLoader
import json
# Define a simple JSON schema (modify as needed).
# Only the keys matter: validate_json() checks that every key listed here is
# present in a document. While this dict is empty that check is vacuously
# true, so no document is filtered out.
json_schema = {
}
# Function to validate a JSON document against a schema
def validate_json(json_data, schema):
    """Return True when ``json_data`` has every top-level key named in ``schema``.

    Args:
        json_data: A parsed JSON value, i.e. the result of ``json.loads``.
        schema: Mapping whose keys are the required top-level keys; its
            values are ignored.

    Returns:
        bool: ``True`` if every key of ``schema`` is present in ``json_data``.
        An empty ``schema`` imposes no requirements and accepts any value.
        With a non-empty ``schema``, a ``json_data`` that is not a JSON
        object (list, string, number, ...) is rejected with ``False``.
    """
    if not schema:
        # Nothing is required, so every document is acceptable.
        return True
    # Only JSON objects have keys. Without this guard ``key in json_data``
    # would do substring matching on strings, element matching on lists, and
    # raise TypeError on numbers/booleans/null.
    if not isinstance(json_data, dict):
        return False
    return all(key in json_data for key in schema)
# 1. Load JSON Files
def load_json_docs(directory, jq_schema='.', schema=None):
    """Load the ``*.json`` files under ``directory`` and keep the valid ones.

    Args:
        directory: Folder searched recursively with the ``**/*.json`` glob.
        jq_schema: jq expression forwarded to every ``JSONLoader``. The
            default ``'.'`` selects the whole parsed file; an expression such
            as ``'.[]'`` selects the elements of a top-level array instead.
        schema: Mapping of required top-level keys handed to
            ``validate_json``. Defaults to the module-level ``json_schema``.

    Returns:
        list: The loaded documents whose ``page_content`` parses as JSON and
        passes ``validate_json``.

    NOTE(review): ``loader.load()`` materialises every document in memory at
    once; for a corpus of tens of gigabytes this presumably needs to be done
    in batches - TODO confirm against the available RAM.
    """
    import logging

    if schema is None:
        schema = json_schema

    loader = DirectoryLoader(
        directory,
        glob='**/*.json',
        loader_cls=JSONLoader,
        # JSONLoader cannot be constructed without ``jq_schema``, and
        # DirectoryLoader only passes the file path on its own, so the extra
        # constructor arguments have to travel through ``loader_kwargs``.
        # ``text_content=False`` lets the loader accept JSON objects/arrays
        # instead of insisting that the selected content is a plain string.
        loader_kwargs={'jq_schema': jq_schema, 'text_content': False},
    )
    documents = loader.load()

    # Manually filter and validate documents based on the JSON schema
    valid_documents = []
    skipped = 0
    for doc in documents:
        # Keep the try body minimal: only the parse can raise JSONDecodeError.
        try:
            json_data = json.loads(doc.page_content)
        except json.JSONDecodeError:
            # Still best-effort (the document is dropped), but counted so the
            # loss is reported below instead of disappearing silently.
            skipped += 1
            continue
        if validate_json(json_data, schema):
            valid_documents.append(doc)

    if skipped:
        logging.getLogger(__name__).warning(
            "Skipped %d document(s) whose content is not valid JSON", skipped
        )
    return valid_documents
# Folder that load_json_docs() searches for JSON files.
directory = '/content/drive/MyDrive/Data Science/LLM/docs/json files'
json_documents = load_json_docs(directory)
# Bare expression: echoes the document count only in an interactive/notebook
# session.
len(json_documents)
# 2. Split JSON Documents
def split_json_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Break the loaded JSON ``documents`` into chunks.

    Constructs a ``RecursiveCharacterTextSplitter`` with the given
    ``chunk_size`` and ``chunk_overlap`` and returns the result of its
    ``split_documents`` call on ``documents``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(documents)
# Chunk the validated JSON documents using split_json_docs()' default
# chunk_size and chunk_overlap, then print how many chunks were produced.
split_json_documents = split_json_docs(json_documents)
print(len(split_json_documents))
# 3. Embedding Text Using Langchain
# Embedding function used both when building the store and when querying it.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# 4. Creating Vector Store with Chroma DB
# Location handed to Chroma as its persist_directory.
persist_directory = "/content/drive/MyDrive/Data Science/LLM/docs/chroma_json_db"
# Build the vector store from the JSON chunks, then ask Chroma to persist it.
vectordb = Chroma.from_documents(
documents=split_json_documents, embedding=embeddings, persist_directory=persist_directory
)
vectordb.persist()
# 5. Using OpenAI Large Language Models (LLM) with Chroma DB
import os
# NOTE(review): "sk-your-key" is a placeholder written straight into the
# process environment; a real key has to be supplied before this can run, and
# it should presumably come from outside the source file - TODO confirm.
os.environ["OPENAI_API_KEY"] = "sk-your-key"
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)
# 6. Extracting Answers from Documents
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)
# NOTE(review): this is the same question used in the text-file example;
# presumably it should be replaced by one about the JSON data - TODO confirm.
query = "who is Mr. Jabez Wilson?"
# Fetch the stored chunks that match the query, then hand those chunks and the
# question to the chain.
matching_docs = vectordb.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)
# Bare expression: echoes the answer only in an interactive/notebook session.
answer