How to create unique IDs for files in Chroma
We will also create a new collection in Chroma along the way
import os
import sys
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
import uuid
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
import chromadb
# The value below is a placeholder; substitute a real OpenAI API key before
# running the script.
os.environ["OPENAI_API_KEY"] = "sk-................................"

# Maps filename suffixes to a callable that builds the matching loader for a
# given path. The suffix groups are mutually exclusive, so at most one entry
# can match any filename.
# NOTE(review): legacy ".doc" files are routed to Docx2txtLoader as well --
# confirm that loader can actually read them.
loader_factories = [
    ((".pdf",), PyPDFLoader),
    ((".docx", ".doc"), Docx2txtLoader),
    ((".txt",), lambda path: TextLoader(path, encoding="utf-8")),
]

# Walk ./docs recursively and load every file whose suffix is recognised;
# files with any other suffix are skipped.
documents = []
for dirpath, _subdirs, filenames in os.walk("./docs"):
    for filename in filenames:
        full_path = os.path.join(dirpath, filename)
        for suffixes, make_loader in loader_factories:
            if filename.endswith(suffixes):
                documents.extend(make_loader(full_path).load())
                break
# Chunking settings: break on newlines, aim for chunks of 1000 characters and
# let neighbouring chunks share 100 characters.
splitter_options = {
    "separator": '\n',
    "chunk_size": 1000,
    "chunk_overlap": 100,
}
document_splitter = CharacterTextSplitter(**splitter_options)

# Replace the list of loaded documents with the list of their chunks.
documents = document_splitter.split_documents(documents)

# Embedding client used when the chunks are written to the vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Derive a deterministic id for every chunk from its text: uuid5 is a
# name-based hash, so chunks with identical page_content always get the same
# id.
new_ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in documents]

# Keep only the first chunk seen for each id, and record that id in a list
# that stays parallel to unique_docs. The ids handed to Chroma must line up
# one-to-one with the documents handed to it; new_ids still holds an entry
# for every dropped duplicate, so it cannot be passed alongside unique_docs.
seen_ids = set()
unique_docs = []
unique_ids = []
for doc, doc_id in zip(documents, new_ids):
    if doc_id in seen_ids:
        continue
    seen_ids.add(doc_id)
    unique_docs.append(doc)
    unique_ids.append(doc_id)

# Store the de-duplicated chunks, with their matching ids, in the "coll"
# collection using ./data as the persist directory.
db = Chroma.from_documents(
    unique_docs,
    embeddings,
    ids=unique_ids,
    collection_name="coll",
    persist_directory="./data",
)
In the next part of the article, we will utilize our unique IDs.

Author: Bogdan Kuhar
Full Stack Developer/coach
https://www.youtube.com/@imimir_com
info@imimir.com