How to load all common document types from a directory using LangChain

import os
import sys
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredXMLLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import uuid
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
import chromadb

from langchain_community.document_loaders import DirectoryLoader

# Put the OpenAI API key into the process environment.
# The value below is a placeholder; substitute your own key.
os.environ.update({"OPENAI_API_KEY": "sk-............................................"})

def process_documents(directory, *, chunk_size=3000, chunk_overlap=100):
    """Load every supported document under *directory* and split it into chunks.

    The directory tree is walked recursively with ``os.walk``. Each file whose
    name ends with a supported extension is read with the loader registered
    for that extension; files with any other extension are ignored. Extensions
    are matched case-insensitively, so ``REPORT.PDF`` is handled the same way
    as ``report.pdf``.

    Args:
        directory: Root directory to scan.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``split_documents`` applied to everything that was
        loaded (an empty input list when no supported file is found).
    """
    # One entry per loader: (accepted suffixes, loader class, extra kwargs).
    # Built inside the function so the loader names are only resolved when
    # the function is actually called.
    loader_specs = (
        ((".pdf",), PyPDFLoader, {}),
        # NOTE(review): .doc is routed to the docx loader, as in the original
        # code; legacy binary .doc files may not be readable by it - confirm.
        ((".docx", ".doc"), Docx2txtLoader, {}),
        # CSV files are intentionally read as plain text.
        ((".txt", ".csv"), TextLoader, {"encoding": "utf-8"}),
        ((".xml",), UnstructuredXMLLoader, {"encoding": "utf-8"}),
        ((".xlsx", ".xls"), UnstructuredExcelLoader, {}),
        ((".pptx",), UnstructuredPowerPointLoader, {}),
    )

    documents = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Compare on a lower-cased name so upper/mixed-case extensions
            # are not silently skipped.
            lowered = file.lower()
            for suffixes, loader_cls, loader_kwargs in loader_specs:
                if lowered.endswith(suffixes):
                    file_path = os.path.join(root, file)
                    loader = loader_cls(file_path, **loader_kwargs)
                    documents.extend(loader.load())
                    break

    # Split the loaded documents into smaller chunks.
    document_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return document_splitter.split_documents(documents)


# Example run: load and chunk every supported file found under ./docs.
directory = "./docs"
processed_documents = process_documents(directory=directory)

This code snippet is designed to traverse through a specified directory and load various types of documents using the langchain library and its associated community extensions. Here’s a breakdown of what it does and the types of files it can load:

Traversing Directory: The code walks through the directory tree starting from “./docs”.
Loading Documents: Depending on the file extension, it loads different types of documents:
- PDF Files: It uses the PyPDFLoader to load PDF documents.
- Microsoft Word Files (.docx, .doc): It employs the Docx2txtLoader to load Word documents.
- Text Files (.txt): It utilizes the TextLoader to load plain text files.
- CSV Files (.csv): Similarly, it uses the TextLoader to load CSV files.
- XML Files (.xml): It uses the UnstructuredXMLLoader to load XML files.
- Microsoft Excel Files (.xlsx, .xls): It uses the UnstructuredExcelLoader to load Excel files.
- PowerPoint Files (.pptx): It utilizes the UnstructuredPowerPointLoader to load PowerPoint files.
Splitting Documents: After loading, it splits the documents into smaller chunks to facilitate further processing. This step is essential for handling large documents efficiently.

Overall, this code snippet provides a comprehensive approach to loading various document types from a directory, making it suitable for applications that require processing diverse sets of textual data.

Bohdan Kukhar

Author: Bogdan Kuhar
Full Stack Developer/coach
https://www.youtube.com/@imimir_com

info@imimir.com

Leave a Comment Cancel reply