diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/dropbox.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/dropbox.mdx new file mode 100644 index 000000000000..1f9eb5857c61 --- /dev/null +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/dropbox.mdx @@ -0,0 +1,138 @@ +--- +hide_table_of_contents: true +sidebar_class_name: node-only +--- + +# Dropbox Loader + +The `DropboxLoader` allows you to load documents from Dropbox into your LangChain applications. It retrieves files or directories from your Dropbox account and converts them into documents ready for processing. + +## Overview + +Dropbox is a file hosting service that brings all your files—traditional documents, cloud content, and web shortcuts—together in one place. With the `DropboxLoader`, you can seamlessly integrate Dropbox file retrieval into your projects. + +## Setup + +1. Create a Dropbox app using the [Dropbox App Console](https://www.dropbox.com/developers/apps/create). +2. Ensure the app has the `files.metadata.read` and `files.content.read` scope permissions. +3. Generate the access token from the Dropbox App Console. +4. To use this loader, you'll need to have Unstructured already set up and ready to use at an available URL endpoint. It can also be configured to run locally. + See the docs [here](/docs/integrations/document_loaders/file_loaders/unstructured) for information on how to do that. +5.
Install the necessary packages: + + ```bash npm2yarn + npm install @langchain/community @langchain/core dropbox + ``` + +## Usage + +### Loading Specific Files + +To load specific files from Dropbox, specify the file paths: + +```typescript +import { DropboxLoader } from "@langchain/community/document_loaders/web/dropbox"; + +const loader = new DropboxLoader({ + clientOptions: { + accessToken: "your-dropbox-access-token", + }, + unstructuredOptions: { + apiUrl: "http://localhost:8000/general/v0/general", // Replace with your Unstructured API URL + }, + filePaths: ["/path/to/file1.txt", "/path/to/file2.pdf"], // Replace with file paths on Dropbox. +}); + +const docs = await loader.load(); +console.log(docs); +``` + +### Loading Files from a Directory + +To load all files from a specific directory, provide the `folderPath` and set the `mode` to `"directory"`. Set `recursive` to `true` to traverse subdirectories: + +```typescript +import { DropboxLoader } from "@langchain/community/document_loaders/web/dropbox"; + +const loader = new DropboxLoader({ + clientOptions: { + accessToken: "your-dropbox-access-token", + }, + unstructuredOptions: { + apiUrl: "http://localhost:8000/general/v0/general", + }, + folderPath: "/path/to/folder", + recursive: true, // Load documents found in subdirectories + mode: "directory", +}); + +const docs = await loader.load(); +console.log(docs); +``` + +### Streaming Documents + +To process large datasets efficiently, use the `loadLazy` method to stream documents asynchronously: + +```typescript +import { DropboxLoader } from "@langchain/community/document_loaders/web/dropbox"; + +const loader = new DropboxLoader({ + clientOptions: { + accessToken: "your-dropbox-access-token", + }, + unstructuredOptions: { + apiUrl: "http://localhost:8000/general/v0/general", + }, + folderPath: "/large/dataset", + recursive: true, + mode: "directory", +}); + +for await (const doc of loader.loadLazy()) { + // Process each document as it's loaded + 
console.log(doc); +} +``` + +### Authentication with Environment Variables + +You can set the `DROPBOX_ACCESS_TOKEN` environment variable instead of passing the access token in `clientOptions`: + +```bash +export DROPBOX_ACCESS_TOKEN=your-dropbox-access-token +``` + +Then initialize the loader without specifying `accessToken`: + +```typescript +import { DropboxLoader } from "@langchain/community/document_loaders/web/dropbox"; + +const loader = new DropboxLoader({ + clientOptions: {}, + unstructuredOptions: { + apiUrl: "http://localhost:8000/general/v0/general", + }, + filePaths: ["/important/notes.txt"], +}); + +const docs = await loader.load(); +console.log(docs[0].pageContent); +``` + +## Configuration Options + +Here are the configuration options for the `DropboxLoader`: + +| Option | Type | Description | +| --------------------- | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `clientOptions` | `DropboxOptions` | Configuration options for initializing the Dropbox client, including authentication details. Refer to the [Dropbox SDK Documentation](https://dropbox.github.io/dropbox-sdk-js/Dropbox.html#Dropbox__anchor) for more information. | +| `unstructuredOptions` | `UnstructuredLoaderOptions` | Options for the `UnstructuredLoader` used to process downloaded files. Includes the `apiUrl` for your Unstructured server. | +| `folderPath` | `string` (optional) | The path to the folder in Dropbox from which to load files. Defaults to the root folder (`""`) if not specified. | +| `filePaths` | `string[]` (optional) | An array of specific file paths in Dropbox to load. Required if `mode` is set to `"file"`. | +| `recursive` | `boolean` (optional) | Indicates whether to recursively traverse folders when `mode` is `"directory"`. Defaults to `false`. 
| +| `mode` | `"file"` or `"directory"` (optional) | The mode of operation. Set to `"file"` to load specific files or `"directory"` to load all files in a directory. Defaults to `"file"`. | + +## API References + +- [Dropbox SDK for JavaScript](https://github.com/dropbox/dropbox-sdk-js) diff --git a/langchain/.env.example b/langchain/.env.example index 2eda74311a41..d91802e79f0f 100644 --- a/langchain/.env.example +++ b/langchain/.env.example @@ -102,3 +102,4 @@ GOOGLE_ROUTES_API_KEY=ADD_YOURS_HERE CONFLUENCE_USERNAME=ADD_YOURS_HERE CONFLUENCE_PASSWORD=ADD_YOURS_HERE CONFLUENCE_PATH=ADD_YOURS_HERE +DROPBOX_ACCESS_TOKEN=ADD_YOURS_HERE \ No newline at end of file diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index e6ae5fa54a4f..b0ccac535902 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -874,6 +874,10 @@ document_loaders/web/cheerio.cjs document_loaders/web/cheerio.js document_loaders/web/cheerio.d.ts document_loaders/web/cheerio.d.cts +document_loaders/web/dropbox.cjs +document_loaders/web/dropbox.js +document_loaders/web/dropbox.d.ts +document_loaders/web/dropbox.d.cts document_loaders/web/html.cjs document_loaders/web/html.js document_loaders/web/html.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index 4a402c6941e8..75fc7ff4de73 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -273,6 +273,7 @@ export const config = { "document_loaders/web/azure_blob_storage_file", "document_loaders/web/browserbase": "document_loaders/web/browserbase", "document_loaders/web/cheerio": "document_loaders/web/cheerio", + "document_loaders/web/dropbox": "document_loaders/web/dropbox", "document_loaders/web/html": "document_loaders/web/html", "document_loaders/web/puppeteer": "document_loaders/web/puppeteer", "document_loaders/web/playwright": 
"document_loaders/web/playwright", @@ -493,6 +494,7 @@ export const config = { "document_loaders/web/azure_blob_storage_file", "document_loaders/web/browserbase", "document_loaders/web/cheerio", + "document_loaders/web/dropbox", "document_loaders/web/puppeteer", "document_loaders/web/playwright", "document_loaders/web/college_confidential", diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 7b826ad1e106..a84f9d8c8ba6 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -37,6 +37,7 @@ "dependencies": { "@langchain/openai": ">=0.2.0 <0.4.0", "binary-extensions": "^2.2.0", + "dropbox": "^10.34.0", "expr-eval": "^2.0.2", "flat": "^5.0.2", "js-yaml": "^4.1.0", @@ -2683,6 +2684,15 @@ "import": "./document_loaders/web/cheerio.js", "require": "./document_loaders/web/cheerio.cjs" }, + "./document_loaders/web/dropbox": { + "types": { + "import": "./document_loaders/web/dropbox.d.ts", + "require": "./document_loaders/web/dropbox.d.cts", + "default": "./document_loaders/web/dropbox.d.ts" + }, + "import": "./document_loaders/web/dropbox.js", + "require": "./document_loaders/web/dropbox.cjs" + }, "./document_loaders/web/html": { "types": { "import": "./document_loaders/web/html.d.ts", @@ -3977,6 +3987,10 @@ "document_loaders/web/cheerio.js", "document_loaders/web/cheerio.d.ts", "document_loaders/web/cheerio.d.cts", + "document_loaders/web/dropbox.cjs", + "document_loaders/web/dropbox.js", + "document_loaders/web/dropbox.d.ts", + "document_loaders/web/dropbox.d.cts", "document_loaders/web/html.cjs", "document_loaders/web/html.js", "document_loaders/web/html.d.ts", diff --git a/libs/langchain-community/src/document_loaders/tests/dropbox.int.test.ts b/libs/langchain-community/src/document_loaders/tests/dropbox.int.test.ts new file mode 100644 index 000000000000..f87f0d4f4a0b --- /dev/null +++ b/libs/langchain-community/src/document_loaders/tests/dropbox.int.test.ts @@ -0,0 +1,223 @@ +/** + * 
NOTE: DROPBOX_ACCESS_TOKEN should be set in environment variables
+ * NOTE: files.content.write permission is required for testing.
+ */
+import { expect } from "@jest/globals";
+import * as url from "node:url";
+import * as path from "node:path";
+import * as fs from "node:fs";
+import { v4 as uuid } from "uuid";
+import { getEnvironmentVariable } from "@langchain/core/utils/env";
+import { Document } from "@langchain/core/documents";
+import { Dropbox, DropboxOptions } from "dropbox";
+
+import {
+  UnstructuredDirectoryLoader,
+  UnstructuredLoader,
+  UnstructuredLoaderOptions,
+} from "../fs/unstructured.js";
+import { DropboxLoader } from "../web/dropbox.js";
+
+// Recursively mirrors the local example_data tree into the remote Dropbox folder.
+const setupDropboxStorageEnvironment = async (
+  localPath: string,
+  dropboxPath: string,
+  dbx: Dropbox
+) => {
+  for (const item of fs.readdirSync(localPath)) {
+    const fullPath = path.join(localPath, item);
+    const dbxPath = `${dropboxPath}/${item}`;
+    const stats = fs.statSync(fullPath);
+    if (stats.isDirectory()) {
+      try {
+        await dbx.filesCreateFolderV2({ path: dbxPath });
+      } catch {
+        // Best effort: the folder may already exist (auth errors are also ignored here).
+      }
+      await setupDropboxStorageEnvironment(fullPath, dbxPath, dbx);
+    } else {
+      await dbx.filesUpload({
+        path: dbxPath,
+        contents: fs.readFileSync(fullPath),
+        mode: { ".tag": "overwrite" },
+      });
+    }
+  }
+};
+
+// Removes the dropbox example_data from the remote dropbox drive.
+const teardownDropboxStorageEnvironment = async (
+  dropboxPath: string,
+  dbx: Dropbox
+) => {
+  try {
+    await dbx.filesDeleteV2({ path: dropboxPath });
+  } catch {
+    // Folder might not exist or auth error.
+  }
+};
+
+describe("DropboxLoader Integration Tests", () => {
+  // Ensure the environment variables are set
+
+  const localTestDataFolder = path.resolve(
+    path.dirname(url.fileURLToPath(import.meta.url)),
+    "./example_data/dropbox"
+  );
+  const dropboxTestDataFolder = `/LangchainDropboxLoaderTest_${uuid()}`;
+  const folder1Files = ["example.txt", "example2.txt"];
+  const folder2Files = ["Jacob_Lee_Resume_2023.pdf"];
+  const rootFiles = ["hello.txt"];
+  const allTestFiles = [...rootFiles, ...folder1Files, ...folder2Files];
+
+  // Copies the dropbox example_data over to the dropbox drive
+  beforeAll(() => {
+    const accessToken = getEnvironmentVariable("DROPBOX_ACCESS_TOKEN");
+    const dbx = new Dropbox({ accessToken });
+    return setupDropboxStorageEnvironment(
+      localTestDataFolder,
+      dropboxTestDataFolder,
+      dbx
+    );
+  });
+
+  // Cleans up and removes the dropbox example_data added during setup.
+  afterAll(() => {
+    const accessToken = getEnvironmentVariable("DROPBOX_ACCESS_TOKEN");
+    const dbx = new Dropbox({ accessToken });
+    return teardownDropboxStorageEnvironment(dropboxTestDataFolder, dbx);
+  });
+
+  // Integration tests for the load method
+  describe("load", () => {
+    it("should load documents from a Dropbox file", async () => {
+      const localFilename = folder2Files[0];
+      const dropboxFilePath = path.posix.join(
+        dropboxTestDataFolder,
+        "folder_2",
+        localFilename
+      );
+      const localFilePath = path.join(
+        localTestDataFolder,
+        "folder_2",
+        localFilename
+      );
+
+      const unstructuredOptions: UnstructuredLoaderOptions = {
+        apiKey: undefined,
+        apiUrl: "http://localhost:8000/general/v0/general",
+      };
+
+      const clientOptions: DropboxOptions = {};
+      const dropboxLoader = new DropboxLoader({
+        clientOptions,
+        unstructuredOptions,
+        filePaths: [dropboxFilePath],
+      });
+
+      const directoryLoader = new UnstructuredLoader(
+        localFilePath,
+        unstructuredOptions
+      );
+
+      const [dropboxDocuments, directoryDocuments] = await Promise.all([
+        dropboxLoader.load(),
+        directoryLoader.load(),
+      ]);
+
+      expect(dropboxDocuments).toBeDefined();
+      expect(dropboxDocuments.length).toBe(directoryDocuments.length);
+
+      const dropboxSourcePath = "dropbox://" + dropboxFilePath;
+
+      dropboxDocuments.forEach((doc) => {
+        expect(doc).toBeInstanceOf(Document);
+        expect(doc.pageContent).toBeDefined();
+        expect(folder2Files).toContain(doc.metadata.filename);
+        expect(dropboxSourcePath).toEqual(doc.metadata.source);
+      });
+    });
+
+    it("should load all documents from a Dropbox folder", async () => {
+      const dropboxFilenames = folder1Files.map((filename) => filename.toLowerCase());
+
+      const dropboxFolderPath = path.posix.join(dropboxTestDataFolder, "folder_1");
+      const localFolderPath = path.join(localTestDataFolder, "folder_1");
+
+      const clientOptions: DropboxOptions = {};
+      const unstructuredOptions: UnstructuredLoaderOptions = {
+        apiKey: "",
+        apiUrl: "http://localhost:8000/general/v0/general",
+      };
+
+      const dropboxLoader = new DropboxLoader({
+        clientOptions,
+        unstructuredOptions,
+        mode: "directory",
+        folderPath: dropboxFolderPath,
+      });
+
+      const directoryLoader = new UnstructuredDirectoryLoader(
+        localFolderPath,
+        unstructuredOptions
+      );
+
+      const [dropboxDocuments, directoryDocuments] = await Promise.all([
+        dropboxLoader.load(),
+        directoryLoader.load(),
+      ]);
+
+      expect(dropboxDocuments).toBeDefined();
+      expect(dropboxDocuments.length).toBe(directoryDocuments.length);
+
+      const dropboxSourcePath = folder1Files.map(
+        (filename) =>
+          "dropbox://" + path.posix.join(dropboxFolderPath, filename).toLowerCase()
+      );
+
+      dropboxDocuments.forEach((doc) => {
+        expect(doc).toBeInstanceOf(Document);
+        expect(doc.pageContent).toBeDefined();
+        expect(dropboxFilenames).toContain(doc.metadata.filename);
+        expect(dropboxSourcePath).toContain(doc.metadata.source);
+      });
+    });
+
+    it("should recursively load all documents from a Dropbox folder", async () => {
+      const dropboxFilenames = allTestFiles.map((filename) => filename.toLowerCase());
+
+      const clientOptions: DropboxOptions = {};
+      const unstructuredOptions: UnstructuredLoaderOptions = {
+        apiKey: "",
+        apiUrl: "http://localhost:8000/general/v0/general",
+      };
+
+      const dropboxLoader = new DropboxLoader({
+        clientOptions,
+        unstructuredOptions,
+        mode: "directory",
+        folderPath: dropboxTestDataFolder,
+        recursive: true,
+      });
+
+      const directoryLoader = new UnstructuredDirectoryLoader(
+        localTestDataFolder,
+        unstructuredOptions
+      );
+
+      const [dropboxDocuments, directoryDocuments] = await Promise.all([
+        dropboxLoader.load(),
+        directoryLoader.load(),
+      ]);
+
+      expect(dropboxDocuments).toBeDefined();
+      expect(dropboxDocuments.length).toBe(directoryDocuments.length);
+
+      dropboxDocuments.forEach((doc) => {
+        expect(doc).toBeInstanceOf(Document);
+        expect(doc.pageContent).toBeDefined();
+        expect(dropboxFilenames).toContain(doc.metadata.filename);
+      });
+    });
+  });
+});
diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/folder_1/example.txt b/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/folder_1/example.txt
new file mode 100644
index 000000000000..04861c126cfe
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/folder_1/example.txt
@@ -0,0 +1,4 @@
+Foo
+Bar
+Baz
+
diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/folder_1/example2.txt b/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/folder_1/example2.txt
new file mode 100644
index 000000000000..e6087a3b49b7
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/folder_1/example2.txt
@@ -0,0 +1,4 @@
+Alice
+Bob
+Mallory
+
diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/folder_2/Jacob_Lee_Resume_2023.pdf b/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/folder_2/Jacob_Lee_Resume_2023.pdf
new file mode 100644
index 000000000000..de0724b53771
Binary files
/dev/null and b/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/folder_2/Jacob_Lee_Resume_2023.pdf differ
diff --git a/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/hello.txt b/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/hello.txt
new file mode 100644
index 000000000000..e8cfc9fb65e4
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/tests/example_data/dropbox/hello.txt
@@ -0,0 +1,2 @@
+Hello world!
+The quick brown fox has jumped over the lazy dog.
\ No newline at end of file
diff --git a/libs/langchain-community/src/document_loaders/web/dropbox.ts b/libs/langchain-community/src/document_loaders/web/dropbox.ts
new file mode 100644
index 000000000000..907f6494e7a8
--- /dev/null
+++ b/libs/langchain-community/src/document_loaders/web/dropbox.ts
@@ -0,0 +1,336 @@
+import * as fsDefault from "node:fs";
+import { promises as fsPromises } from "node:fs";
+import * as path from "node:path";
+import * as os from "node:os";
+import { files as DropboxFiles } from "dropbox/types/dropbox_types.js";
+import { Dropbox, DropboxOptions, DropboxAuth } from "dropbox";
+
+import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
+import { Document } from "@langchain/core/documents";
+import { getEnvironmentVariable } from "@langchain/core/utils/env";
+import {
+  UnstructuredLoader as UnstructuredLoaderDefault,
+  UnstructuredLoaderOptions,
+} from "../fs/unstructured.js";
+
+/**
+ * Interface representing the configuration options for the {@link DropboxLoader}.
+ */
+export interface DropboxLoaderConfig {
+  /**
+   * Options for initializing the Dropbox client.
+   * See [Dropbox SDK Documentation](https://dropbox.github.io/dropbox-sdk-js/Dropbox.html#Dropbox__anchor) for details.
+   */
+  clientOptions: DropboxOptions;
+  /**
+   * Options for the UnstructuredLoader used to process downloaded files.
+   */
+  unstructuredOptions: UnstructuredLoaderOptions;
+  /**
+   * The path to the folder in Dropbox to load files from.
+   * Defaults to the root folder if not specified.
+   */
+  folderPath?: string;
+  /**
+   * Specific file paths in Dropbox to load.
+   * Required if `mode` is set to `"file"`.
+   */
+  filePaths?: string[];
+  /**
+   * Whether to recursively traverse folders when `mode` is `"directory"`.
+   * Defaults to `false`.
+   */
+  recursive?: boolean;
+  /**
+   * Mode of operation: `"file"` to load specific files, `"directory"` to load all files in a directory.
+   * Defaults to `"file"`.
+   */
+  mode?: "file" | "directory";
+  /**
+   * File system module; defaults to Node's `fs`. NOTE(review): stored but not read by `_loadFile`, which uses `node:fs` promises.
+   */
+  fs?: typeof fsDefault;
+  /**
+   * The UnstructuredLoader class to use for processing files.
+   * Defaults to the UnstructuredLoader provided by `@langchain/community`.
+   */
+  UnstructuredLoader?: typeof UnstructuredLoaderDefault;
+}
+
+/**
+ * A document loader that retrieves files from Dropbox and processes them into `Document` instances.
+ * This loader uses the Dropbox API to download files and the `UnstructuredLoader` to process them.
+ *
+ * @example
+ * ```typescript
+ * import { DropboxLoader } from "@langchain/community/document_loaders/web/dropbox";
+ *
+ * const loader = new DropboxLoader({
+ *   clientOptions: {
+ *     accessToken: "your-dropbox-access-token",
+ *   },
+ *   unstructuredOptions: {
+ *     apiUrl: "http://localhost:8000/general/v0/general",
+ *   },
+ *   folderPath: "/path/to/folder",
+ *   recursive: true,
+ *   mode: "directory",
+ * });
+ *
+ * const docs = await loader.load();
+ * ```
+ */
+export class DropboxLoader extends BaseDocumentLoader {
+  /**
+   * The Dropbox client instance used to interact with the Dropbox API.
+   */
+  protected dropboxClient: Dropbox;
+
+  /**
+   * Options for the UnstructuredLoader used to process downloaded files.
+   */
+  protected unstructuredOptions: UnstructuredLoaderOptions;
+
+  /**
+   * The path to the folder in Dropbox to load files from.
+   */
+  protected folderPath: string;
+
+  /**
+   * Specific file paths in Dropbox to load.
+   */
+  protected filePaths: string[];
+
+  /**
+   * Whether to recursively traverse folders when `mode` is `"directory"`.
+   */
+  protected recursive: boolean;
+
+  /**
+   * Mode of operation: `"file"` to load specific files, `"directory"` to load all files in a directory.
+   */
+  protected mode: "file" | "directory";
+
+  /**
+   * The file system module supplied via config (kept on the instance; not read by `_loadFile`).
+   */
+  protected fs: typeof fsDefault;
+
+  /**
+   * The UnstructuredLoader class to use for processing files.
+   */
+  protected _UnstructuredLoader: typeof UnstructuredLoaderDefault;
+
+  /**
+   * Creates an instance of `DropboxLoader`.
+   * @param config - Configuration options for the loader.
+   * @throws Will throw an error if `mode` is `"file"` and `filePaths` is not provided or empty.
+   */
+  constructor({
+    clientOptions,
+    unstructuredOptions,
+    folderPath = "",
+    filePaths,
+    recursive = false,
+    mode = "file",
+    fs = fsDefault,
+    UnstructuredLoader = UnstructuredLoaderDefault,
+  }: DropboxLoaderConfig) {
+    super();
+
+    if (mode === "file" && (!filePaths || filePaths.length === 0)) {
+      throw new Error(`"filePaths" must be set if "mode" is "file".`);
+    }
+
+    this.unstructuredOptions = unstructuredOptions;
+    this.folderPath = folderPath;
+    this.filePaths = filePaths || [];
+    this.recursive = recursive;
+    this.mode = mode;
+    this.fs = fs;
+    this._UnstructuredLoader = UnstructuredLoader;
+
+    this.dropboxClient = DropboxLoader._getDropboxClient(clientOptions);
+  }
+
+  /**
+   * Asynchronously loads documents from Dropbox, yielding each `Document` as it is loaded.
+   * Files are downloaded and processed one at a time, in path order.
+   *
+   * @returns An async generator yielding `Document` instances.
+   */
+  public async *loadLazy(): AsyncGenerator<Document> {
+    let paths: string[] = [];
+
+    if (this.mode === "file") {
+      paths = this.filePaths;
+    } else if (this.mode === "directory") {
+      paths = await this._fetchFilePathList();
+    }
+
+    for (const filePath of paths) {
+      const docs = await this._loadFile(filePath);
+      for (const doc of docs) {
+        yield doc;
+      }
+    }
+  }
+
+  /**
+   * Loads all documents from Dropbox based on the specified configuration.
+   *
+   * @returns A promise that resolves to an array of `Document` instances.
+   */
+  async load(): Promise<Document[]> {
+    const documents: Document[] = [];
+    for await (const doc of this.loadLazy()) {
+      documents.push(doc);
+    }
+    return documents;
+  }
+
+  /**
+   * Generates a list of file paths from the specified Dropbox folder that need to be downloaded
+   * and processed into documents. This method is called only when the loader is operating in
+   * `"directory"` mode. On a listing error it logs and resolves to an empty list.
+   *
+   * @returns A promise that resolves to an array of Dropbox file paths to be downloaded and processed.
+   */
+  private async _fetchFilePathList(): Promise<string[]> {
+    const client: Dropbox = this.dropboxClient;
+    const filePaths: string[] = [];
+
+    /**
+     * Collects the `path_lower` of every entry tagged `"file"`; other entries are skipped.
+     * @param entries - Array of Dropbox metadata entries.
+     */
+    const processEntries = (entries: DropboxFiles.MetadataReference[]) => {
+      entries
+        .filter((entry) => entry[".tag"] === "file")
+        .forEach((fileEntry) => {
+          if (fileEntry.path_lower) filePaths.push(fileEntry.path_lower);
+        });
+    };
+
+    try {
+      let listFolderResponse = await client.filesListFolder({
+        path: this.folderPath,
+        recursive: this.recursive,
+      });
+
+      processEntries(listFolderResponse.result.entries);
+      while (listFolderResponse.result.has_more) {
+        listFolderResponse = await client.filesListFolderContinue({
+          cursor: listFolderResponse.result.cursor,
+        });
+        processEntries(listFolderResponse.result.entries);
+      }
+
+      return filePaths;
+    } catch (error) {
+      console.error(`Error listing files in folder ${this.folderPath}:`, error);
+      return [];
+    }
+  }
+
+  /**
+   * Downloads a file from Dropbox, processes it into `Document` instances using the `UnstructuredLoader`,
+   * and returns the resulting documents. This method handles the entire lifecycle of the file processing,
+   * including downloading, temporary storage, processing, metadata augmentation, and cleanup.
+   *
+   * @param filePath - The path to the file in Dropbox.
+   * @returns A promise that resolves to the file's `Document` instances, or `[]` if processing failed.
+   */
+  private async _loadFile(filePath: string): Promise<Document[]> {
+    const client: Dropbox = this.dropboxClient;
+    let tempDir: string | undefined;
+    let localFilePath: string | undefined;
+    try {
+      const fetchRes = await client.filesDownload({ path: filePath });
+      const fileMetadata =
+        fetchRes.result as DropboxFiles.FileMetadataReference & {
+          fileBinary: Buffer;
+        };
+
+      if (!fileMetadata.fileBinary) {
+        throw new Error(`Failed to download file: ${filePath}`);
+      }
+
+      const fileBinary = fileMetadata.fileBinary;
+
+      // Create temporary directory
+      tempDir = await fsPromises.mkdtemp(
+        path.join(os.tmpdir(), "dropboxfileloader-")
+      );
+
+      // Strip the leading "/" so the Dropbox path nests under tempDir
+      const normalizedFilePath = filePath.startsWith("/")
+        ? filePath.slice(1)
+        : filePath;
+      localFilePath = path.join(tempDir, normalizedFilePath);
+
+      await fsPromises.mkdir(path.dirname(localFilePath), { recursive: true });
+
+      await fsPromises.writeFile(localFilePath, fileBinary, {
+        encoding: "binary",
+      });
+
+      // Create an unstructured loader and load the file.
+      const unstructuredLoader = new this._UnstructuredLoader(
+        localFilePath,
+        this.unstructuredOptions
+      );
+      const docs = await unstructuredLoader.load();
+
+      // Point `source` at the Dropbox path rather than the temporary local copy.
+      const sourceMetadata = { source: `dropbox://${filePath}` };
+      for (const doc of docs) {
+        doc.metadata = { ...doc.metadata, ...sourceMetadata };
+      }
+
+      return docs;
+    } catch (error) {
+      console.error(`Error processing file ${filePath}:`, error);
+      console.error(`File ${filePath} was skipped.`);
+      return []; // Proceed to the next file
+    } finally {
+      // Cleanup temporary files
+      if (localFilePath) {
+        try {
+          await fsPromises.unlink(localFilePath);
+        } catch (err) {
+          console.warn(`Failed to delete file ${localFilePath}:`, err);
+        }
+      }
+      if (tempDir) {
+        try {
+          await fsPromises.rm(tempDir, { recursive: true, force: true });
+        } catch (err) {
+          console.warn(`Failed to delete temp directory ${tempDir}:`, err);
+        }
+      }
+    }
+  }
+
+  /**
+   * Creates and returns a Dropbox client instance configured with the provided options.
+   * If neither `auth` nor `accessToken` is set in `clientOptions`, it builds a `DropboxAuth`
+   * from the options plus the `DROPBOX_ACCESS_TOKEN` environment variable.
+   *
+   * @param clientOptions - Configuration options for initializing the Dropbox client,
+   * including authentication details.
+   * @returns An instance of the Dropbox client.
+   */
+  private static _getDropboxClient(clientOptions: DropboxOptions): Dropbox {
+    const options = clientOptions || {};
+    if (options.auth || options.accessToken) {
+      return new Dropbox(options);
+    }
+    const accessToken = getEnvironmentVariable("DROPBOX_ACCESS_TOKEN");
+    const auth = new DropboxAuth({
+      ...options,
+      accessToken,
+    });
+    return new Dropbox({ ...options, auth });
+  }
+}
diff --git a/libs/langchain-community/src/load/import_constants.ts b/libs/langchain-community/src/load/import_constants.ts
index 722dd82e678b..a2188769baa5 100644
--- a/libs/langchain-community/src/load/import_constants.ts
+++ b/libs/langchain-community/src/load/import_constants.ts
@@ -147,6 +147,7 @@ export const optionalImportEntrypoints: string[] = [
   "langchain_community/document_loaders/web/azure_blob_storage_file",
   "langchain_community/document_loaders/web/browserbase",
   "langchain_community/document_loaders/web/cheerio",
+  "langchain_community/document_loaders/web/dropbox",
   "langchain_community/document_loaders/web/puppeteer",
   "langchain_community/document_loaders/web/playwright",
   "langchain_community/document_loaders/web/college_confidential",
diff --git a/yarn.lock b/yarn.lock
index 5e6b6ed60a57..9b4d549ced33 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -11799,6 +11799,7 @@ __metadata:
     dotenv: ^16.0.3
     dpdm: ^3.12.0
     dria: ^0.0.3
+    dropbox: ^10.34.0
     duck-duck-scrape: ^2.2.5
     epub2: ^3.0.1
     eslint: ^8.33.0
@@ -25893,6 +25894,17 @@ __metadata:
   languageName: node
   linkType: hard
 
+"dropbox@npm:^10.34.0":
+  version: 10.34.0
+  resolution: "dropbox@npm:10.34.0"
+  dependencies:
+    node-fetch: ^2.6.1
+  peerDependencies:
+    "@types/node-fetch": ^2.5.7
+  checksum: 5474e31b0e9c8ff68d140a12b8ef5a06e17125f254ad47702e8ab6932fb0405b467c58098e4e6546e29df57f6fa7581f749b6eb0c2a6daec29644a5a8f5cba75
+  languageName: node
+  linkType: hard
+
 "duck-duck-scrape@npm:^2.2.5":
   version: 2.2.5
   resolution: "duck-duck-scrape@npm:2.2.5"