How to create unique IDs for files in Chroma
We will also create a new collection in Chroma along the way
import os
import sys
import uuid

import chromadb
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# Placeholder value: substitute your own key (and avoid committing a real one).
os.environ["OPENAI_API_KEY"] = "sk-................................"

documents = []

# Traverse the directory tree and collect all supported files in the ./docs folder.
for root, dirs, files in os.walk("./docs"):
    for file in files:
        file_path = os.path.join(root, file)
        if file.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
            documents.extend(loader.load())
        elif file.endswith((".docx", ".doc")):
            # NOTE(review): legacy binary .doc files are routed to the same
            # loader as .docx -- confirm that Docx2txtLoader can read them.
            loader = Docx2txtLoader(file_path)
            documents.extend(loader.load())
        elif file.endswith(".txt"):
            loader = TextLoader(file_path, encoding="utf-8")
            documents.extend(loader.load())

# Split the documents into smaller chunks.
document_splitter = CharacterTextSplitter(
    separator="\n", chunk_size=1000, chunk_overlap=100
)
documents = document_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Derive a deterministic ID for every chunk from its text: chunks with
# identical content get the same UUID, which is what lets us detect duplicates.
new_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in documents
]

# Keep only the first chunk seen for each ID, and collect the matching ID
# alongside it so that the two lists stay the same length and in the same order.
seen_ids = set()
unique_docs = []
unique_ids = []
for doc, doc_id in zip(documents, new_ids):
    if doc_id not in seen_ids:
        seen_ids.add(doc_id)
        unique_docs.append(doc)
        unique_ids.append(doc_id)

# Add the unique documents to the "coll" collection persisted under ./data.
# The IDs passed here must correspond one-to-one with unique_docs; passing the
# full new_ids list would supply more (and repeated) IDs than documents.
db = Chroma.from_documents(
    unique_docs,
    embeddings,
    ids=unique_ids,
    collection_name="coll",
    persist_directory="./data",
)
In the next part of the article, we will utilize our unique IDs.
Author: Bogdan Kuhar
Full Stack Developer/coach
https://www.youtube.com/@imimir_com
info@imimir.com