How to load all the files from a directory using LangChain

The snippet below walks a directory tree, loads every supported file type into LangChain Document objects, and splits the results into chunks:

import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredExcelLoader,
    UnstructuredPowerPointLoader,
    UnstructuredXMLLoader,
)

# Load OPENAI_API_KEY from a .env file rather than hard-coding the key in
# source; the key is only needed once the chunks are embedded (see below).
load_dotenv()

def process_documents(directory):
    """Recursively load every supported file under `directory` into
    LangChain Document objects, then split them into chunks."""
    documents = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            name = file.lower()  # case-insensitive extension matching
            if name.endswith(".pdf"):
                loader = PyPDFLoader(file_path)
            elif name.endswith((".docx", ".doc")):
                # docx2txt reliably parses only .docx; legacy binary .doc
                # files may fail to load.
                loader = Docx2txtLoader(file_path)
            elif name.endswith((".txt", ".csv")):
                # CSVs are ingested as plain text here; use CSVLoader for
                # one Document per row instead.
                loader = TextLoader(file_path, encoding="utf-8")
            elif name.endswith(".xml"):
                loader = UnstructuredXMLLoader(file_path, encoding="utf-8")
            elif name.endswith((".xlsx", ".xls")):
                loader = UnstructuredExcelLoader(file_path)
            elif name.endswith(".pptx"):
                loader = UnstructuredPowerPointLoader(file_path)
            else:
                continue  # skip unsupported file types
            documents.extend(loader.load())

    # Split the loaded documents into smaller, overlapping chunks so that
    # downstream embedding and retrieval work on manageable pieces.
    document_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
    return document_splitter.split_documents(documents)


# Usage example: load and chunk everything under ./docs
directory = "./docs"
processed_documents = process_documents(directory)
print(f"Loaded {len(processed_documents)} chunks")

This code snippet traverses a specified directory and loads various types of documents using the LangChain library and its community extensions. Here is a breakdown of what it does and the file types it can load:

  1. Traversing the Directory: The code walks the directory tree rooted at the path passed to process_documents ("./docs" in the usage example).
  2. Loading Documents: Depending on the file extension, it loads each document with a matching loader (a DirectoryLoader-based alternative is sketched after this list):
    • PDF Files (.pdf): It uses the PyPDFLoader to load PDF documents.
    • Microsoft Word Files (.docx, .doc): It employs the Docx2txtLoader; note that docx2txt reliably parses only the modern .docx format, so legacy binary .doc files may fail to load.
    • Text Files (.txt): It utilizes the TextLoader to load plain text files.
    • CSV Files (.csv): It also uses the TextLoader, so each CSV is ingested as a single plain-text document; use CSVLoader instead if you want one document per row.
    • XML Files (.xml): It uses the UnstructuredXMLLoader to load XML files.
    • Microsoft Excel Files (.xlsx, .xls): It uses the UnstructuredExcelLoader to load Excel workbooks (this requires the unstructured package to be installed).
    • PowerPoint Files (.pptx): It utilizes the UnstructuredPowerPointLoader to load PowerPoint files.
  3. Splitting Documents: After loading, it splits the documents into 3,000-character chunks with 100 characters of overlap to facilitate further processing; this step is essential for handling large documents efficiently. A standalone example of the splitter follows below.
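
If a directory contains only one file type, LangChain's DirectoryLoader can replace the manual os.walk loop entirely. Here is a minimal sketch for a PDF-only directory (the path and glob pattern are illustrative):

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# Recursively load every PDF under ./docs in a single call; DirectoryLoader
# performs the directory traversal that the os.walk loop above does by hand.
pdf_documents = DirectoryLoader("./docs", glob="**/*.pdf", loader_cls=PyPDFLoader).load()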

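To see the splitter's behavior in isolation, here is a small standalone example (the sample text and chunk sizes are made up for illustration):

from langchain.text_splitter import RecursiveCharacterTextSplitter

sample_text = "LangChain document loaders return Document objects. " * 40
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
chunks = splitter.split_text(sample_text)
# Each chunk is at most 200 characters, and consecutive chunks share up to
# 50 characters so context carries across chunk boundaries.
print(len(chunks), len(chunks[0]))
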
Overall, this code snippet provides a comprehensive approach to loading various document types from a directory, making it suitable for applications that require processing diverse sets of textual data.
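
A natural next step is to embed these chunks into a vector store and query them with a conversational retrieval chain. Here is a minimal sketch using Chroma and OpenAI embeddings (the persist directory, retriever settings, and question are placeholders, and OPENAI_API_KEY must be set in the environment):

from langchain.chains import ConversationalRetrievalChain
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

chunks = process_documents("./docs")

# Embed each chunk with OpenAI embeddings and persist it in a local
# Chroma collection on disk.
vectordb = Chroma.from_documents(
    chunks, embedding=OpenAIEmbeddings(), persist_directory="./chroma_db"
)

# Answer questions against the indexed documents; chat history is threaded
# through the chain on each call to support follow-up questions.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(temperature=0),
    retriever=vectordb.as_retriever(search_kwargs={"k": 4}),
)
result = qa_chain.invoke({"question": "What topics do these documents cover?", "chat_history": []})
print(result["answer"])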
