How to create unique IDs for files in Chroma

How to create unique IDs for files in Chroma

We will also create a new collection in Chroma along the way

import os
import sys
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
import uuid
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
import chromadb

# Placeholder key: substitute a real OpenAI API key before running.
os.environ["OPENAI_API_KEY"] = "sk-................................"

# Accumulates every document object produced by the loaders below.
documents = []

# Walk ./docs recursively and load each supported file with the loader
# that matches its extension; files with any other extension are skipped.
for folder, _subfolders, filenames in os.walk("./docs"):
    for filename in filenames:
        full_path = os.path.join(folder, filename)
        if filename.endswith(".pdf"):
            loader = PyPDFLoader(full_path)
        elif filename.endswith((".docx", ".doc")):
            loader = Docx2txtLoader(full_path)
        elif filename.endswith(".txt"):
            loader = TextLoader(full_path, encoding="utf-8")
        else:
            continue
        documents.extend(loader.load())


# Break the loaded documents into smaller, overlapping chunks.
# The splitter cuts on newlines, with chunk_size=1000 and chunk_overlap=100.
document_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=100,
)
documents = document_splitter.split_documents(documents)

# Embedding model used when the chunks are written to the vector store.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)


# Saving to the database and creating unique ids for each document.
# uuid5 is a deterministic, name-based UUID, so two chunks with identical
# text always receive the same id.
new_ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in documents]

# Keep only the first chunk seen for each id. The ids of the retained chunks
# are collected in lockstep so that unique_ids[i] always belongs to
# unique_docs[i]; passing the full new_ids list would misalign ids and
# documents (and repeat ids) as soon as any duplicate chunk exists.
seen_ids = set()
unique_docs = []
unique_ids = []

for doc, doc_id in zip(documents, new_ids):
    # Skip chunks whose id has already been processed.
    if doc_id in seen_ids:
        continue
    seen_ids.add(doc_id)
    unique_docs.append(doc)
    unique_ids.append(doc_id)

# Adding unique documents to the database under their matching ids.
db = Chroma.from_documents(
    unique_docs,
    embeddings,
    ids=unique_ids,
    collection_name="coll",
    persist_directory="./data",
)



In the next part of the article, we will put these unique IDs to use.

 

Leave a Comment