feat(community): Add AirtableLoader to load documents from Airtable with retry and pagination handling (#7106)

Co-authored-by: Jacob Lee <[email protected]>
SkSirius and jacoblee93 authored Nov 12, 2024
1 parent 306f31e commit 18b1810
Showing 9 changed files with 504 additions and 5 deletions.
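
The loader implementation itself (`document_loaders/web/airtable`) is registered in the config and package files below, but its source is not expanded in this excerpt. As a rough, hypothetical sketch of the retry and pagination handling named in the commit title (the names, signatures, and query format here are assumptions, not the committed code):

```typescript
// Hypothetical sketch only; not the committed AirtableLoader implementation.
interface AirtableResponse {
  records: Array<{ id: string; fields: Record<string, unknown>; createdTime: string }>;
  offset?: string;
}

async function fetchPageWithRetry(
  url: string,
  token: string,
  offset?: string,
  maxRetries = 3
): Promise<AirtableResponse> {
  // Retry the request up to maxRetries times before giving up.
  for (let attempt = 0; ; attempt += 1) {
    try {
      const res = await fetch(offset ? `${url}?offset=${offset}` : url, {
        headers: { Authorization: `Bearer ${token}` },
      });
      if (!res.ok) throw new Error(`Airtable request failed: ${res.status}`);
      return (await res.json()) as AirtableResponse;
    } catch (err) {
      if (attempt >= maxRetries) throw err;
    }
  }
}

async function fetchAllRecords(url: string, token: string) {
  // Follow Airtable's offset cursor until no further pages remain.
  const records: AirtableResponse["records"] = [];
  let offset: string | undefined;
  do {
    const page = await fetchPageWithRetry(url, token, offset);
    records.push(...page.records);
    offset = page.offset;
  } while (offset);
  return records;
}
```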
@@ -0,0 +1,25 @@
---
hide_table_of_contents: true
---

import loadExample from "@examples/document_loaders/airtable_load";
import CodeBlock from "@theme/CodeBlock";

# AirtableLoader

The `AirtableLoader` class loads documents from Airtable tables. It supports two main methods:

1. `load()`: Retrieves all records at once, ideal for small to moderate datasets.
2. `loadLazy()`: Fetches records one at a time, which is more memory-efficient for large datasets (see the sketch below).
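
As a quick comparison of the two methods, here is a minimal sketch; the placeholder base and table IDs are assumptions, and the full runnable example appears under Usage below:

```typescript
import { AirtableLoader } from "@langchain/community/document_loaders/web/airtable";

// Placeholder IDs: substitute your own base and table.
const loader = new AirtableLoader({
  baseId: "YOUR_BASE_ID",
  tableId: "YOUR_TABLE_ID",
});

// load(): materializes every record into memory at once.
const docs = await loader.load();
console.log(`Loaded ${docs.length} documents`);

// loadLazy(): yields one document at a time, so a large table
// never has to be held in memory in full.
for await (const doc of loader.loadLazy()) {
  console.log(doc.pageContent);
}
```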

## Prerequisites

Ensure that your Airtable API token is available as an environment variable:

```typescript
process.env.AIRTABLE_API_TOKEN = "YOUR_AIRTABLE_API_TOKEN";
```

## Usage

<CodeBlock language="typescript">{loadExample}</CodeBlock>
44 changes: 44 additions & 0 deletions examples/src/document_loaders/airtable_load.ts
@@ -0,0 +1,44 @@
import { AirtableLoader } from "@langchain/community/document_loaders/web/airtable";
import { Document } from "@langchain/core/documents";

// Default Airtable loader
const loader = new AirtableLoader({
  tableId: "YOUR_TABLE_ID",
  baseId: "YOUR_BASE_ID",
});

try {
  const documents: Document[] = await loader.load();
  console.log("Loaded documents:", documents);
} catch (error) {
  console.error("Error loading documents:", error);
}

// Lazy Airtable loader
const loaderLazy = new AirtableLoader({
  tableId: "YOUR_TABLE_ID",
  baseId: "YOUR_BASE_ID",
});

try {
  console.log("Lazily loading documents:");
  for await (const document of loaderLazy.loadLazy()) {
    console.log("Loaded document:", document);
  }
} catch (error) {
  console.error("Error loading documents lazily:", error);
}

// Airtable loader scoped to a specific view
const loaderView = new AirtableLoader({
  tableId: "YOUR_TABLE_ID",
  baseId: "YOUR_BASE_ID",
  kwargs: { view: "YOUR_VIEW_NAME" },
});

try {
  const documents: Document[] = await loaderView.load();
  console.log("Loaded documents with view:", documents);
} catch (error) {
  console.error("Error loading documents with view:", error);
}
4 changes: 4 additions & 0 deletions libs/langchain-community/.gitignore
@@ -998,6 +998,10 @@ document_loaders/fs/pptx.cjs
document_loaders/fs/pptx.js
document_loaders/fs/pptx.d.ts
document_loaders/fs/pptx.d.cts
document_loaders/web/airtable.cjs
document_loaders/web/airtable.js
document_loaders/web/airtable.d.ts
document_loaders/web/airtable.d.cts
utils/convex.cjs
utils/convex.js
utils/convex.d.ts
14 changes: 9 additions & 5 deletions libs/langchain-community/langchain.config.js
@@ -189,7 +189,8 @@ export const config = {
// callbacks
"callbacks/handlers/llmonitor": "callbacks/handlers/llmonitor",
"callbacks/handlers/lunary": "callbacks/handlers/lunary",
"callbacks/handlers/upstash_ratelimit": "callbacks/handlers/upstash_ratelimit",
"callbacks/handlers/upstash_ratelimit":
"callbacks/handlers/upstash_ratelimit",
// retrievers
"retrievers/amazon_kendra": "retrievers/amazon_kendra",
"retrievers/amazon_knowledge_base": "retrievers/amazon_knowledge_base",
@@ -260,6 +261,7 @@ export const config = {
"indexes/memory": "indexes/memory",
"indexes/sqlite": "indexes/sqlite",
// document_loaders
"document_loaders/web/airtable": "document_loaders/web/airtable",
"document_loaders/web/apify_dataset": "document_loaders/web/apify_dataset",
"document_loaders/web/assemblyai": "document_loaders/web/assemblyai",
"document_loaders/web/azure_blob_storage_container":
@@ -310,18 +312,20 @@ export const config = {
"utils/event_source_parse": "utils/event_source_parse",
"utils/cassandra": "utils/cassandra",
// experimental
"experimental/callbacks/handlers/datadog": "experimental/callbacks/handlers/datadog",
"experimental/callbacks/handlers/datadog":
"experimental/callbacks/handlers/datadog",
"experimental/graph_transformers/llm":
"experimental/graph_transformers/llm",
"experimental/multimodal_embeddings/googlevertexai":
"experimental/multimodal_embeddings/googlevertexai",
"experimental/hubs/makersuite/googlemakersuitehub":
"experimental/hubs/makersuite/googlemakersuitehub",
"experimental/chat_models/ollama_functions": "experimental/chat_models/ollama_functions",
"experimental/chat_models/ollama_functions":
"experimental/chat_models/ollama_functions",
"experimental/llms/chrome_ai": "experimental/llms/chrome_ai",
"experimental/tools/pyinterpreter": "experimental/tools/pyinterpreter",
// chains
"chains/graph_qa/cypher": "chains/graph_qa/cypher"
"chains/graph_qa/cypher": "chains/graph_qa/cypher",
},
requiresOptionalDependency: [
"tools/aws_sfn",
@@ -520,7 +524,7 @@ export const config = {
// chains
"chains/graph_qa/cypher",
// langgraph checkpointers
"langgraph/checkpointers/vercel_kv"
"langgraph/checkpointers/vercel_kv",
],
packageSuffix: "community",
tsConfigPath: resolve("./tsconfig.json"),
13 changes: 13 additions & 0 deletions libs/langchain-community/package.json
@@ -2958,6 +2958,15 @@
"import": "./document_loaders/fs/pptx.js",
"require": "./document_loaders/fs/pptx.cjs"
},
"./document_loaders/web/airtable": {
"types": {
"import": "./document_loaders/web/airtable.d.ts",
"require": "./document_loaders/web/airtable.d.cts",
"default": "./document_loaders/web/airtable.d.ts"
},
"import": "./document_loaders/web/airtable.js",
"require": "./document_loaders/web/airtable.cjs"
},
"./utils/convex": {
"types": {
"import": "./utils/convex.d.ts",
@@ -4061,6 +4070,10 @@
"document_loaders/fs/pptx.js",
"document_loaders/fs/pptx.d.ts",
"document_loaders/fs/pptx.d.cts",
"document_loaders/web/airtable.cjs",
"document_loaders/web/airtable.js",
"document_loaders/web/airtable.d.ts",
"document_loaders/web/airtable.d.cts",
"utils/convex.cjs",
"utils/convex.js",
"utils/convex.d.ts",
@@ -0,0 +1,59 @@
/**
 * NOTE: AIRTABLE_API_TOKEN should be set in environment variables
 */
import { Document } from "@langchain/core/documents";
import { AirtableLoader } from "../web/airtable.js";

describe("AirtableLoader Integration Tests", () => {
  // AIRTABLE_API_TOKEN must be set in the environment; replace these
  // placeholder base and table IDs with real ones before running.
  const baseId = "BASE_ID";
  const tableId = "TABLE_ID";

  // Integration tests for the load method
  describe("load", () => {
    it("should load documents from Airtable", async () => {
      const loader = new AirtableLoader({ tableId, baseId });

      const documents = await loader.load();

      expect(documents).toBeDefined();
      expect(documents.length).toBeGreaterThan(0);

      documents.forEach((doc) => {
        expect(doc).toBeInstanceOf(Document);
        expect(doc.pageContent).toBeDefined();
        expect(doc.metadata).toMatchObject({
          source: `${baseId}_${tableId}`,
          base_id: baseId,
          table_id: tableId,
        });
      });
    }, 20000);
  });

  // Integration tests for the loadLazy method
  describe("loadLazy", () => {
    it("should lazily load documents from Airtable", async () => {
      const loader = new AirtableLoader({ tableId, baseId });

      const documents: Document[] = [];
      for await (const doc of loader.loadLazy()) {
        documents.push(doc);
      }

      expect(documents).toBeDefined();
      expect(documents.length).toBeGreaterThan(0);

      documents.forEach((doc) => {
        expect(doc).toBeInstanceOf(Document);
        expect(doc.pageContent).toBeDefined();
        expect(doc.metadata).toMatchObject({
          source: `${baseId}_${tableId}`,
          base_id: baseId,
          table_id: tableId,
        });
      });
    }, 20000);
  });
});
177 changes: 177 additions & 0 deletions libs/langchain-community/src/document_loaders/tests/airtable.test.ts
@@ -0,0 +1,177 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
/* eslint-disable no-process-env */
import { Document } from "@langchain/core/documents";
import { expect, jest } from "@jest/globals";
import { AirtableLoader } from "../web/airtable.js";

// Mock the global fetch function
(global as any).fetch = jest.fn();

describe("AirtableLoader", () => {
  beforeEach(() => {
    jest.clearAllMocks();
    process.env.AIRTABLE_API_TOKEN = "foobar";
  });

  // Tests for the load method
  describe("load", () => {
    it("should load documents correctly", async () => {
      const loader = new AirtableLoader({
        tableId: "tableId",
        baseId: "baseId",
        kwargs: { view: "test-view" },
      });

      // Spy on the private fetchRecords method
      const mockFetchRecords = jest.spyOn(loader as any, "fetchRecords");

      // Mock data to be returned by fetchRecords
      const mockRecords = [
        {
          id: "rec1",
          fields: { Name: "Record 1" },
          createdTime: "2021-01-01T00:00:00.000Z",
        },
        {
          id: "rec2",
          fields: { Name: "Record 2" },
          createdTime: "2021-01-02T00:00:00.000Z",
        },
      ];

      // Mock the resolved value of fetchRecords
      mockFetchRecords.mockResolvedValue({ records: mockRecords });

      const documents = await loader.load();

      expect(documents).toHaveLength(2);
      expect(documents[0].pageContent).toBe(JSON.stringify(mockRecords[0]));
      expect(documents[1].pageContent).toBe(JSON.stringify(mockRecords[1]));
      expect(mockFetchRecords).toHaveBeenCalledTimes(1);
    });

    it("should handle pagination correctly", async () => {
      const loader = new AirtableLoader({
        tableId: "tableId",
        baseId: "baseId",
      });

      const mockFetchRecords = jest.spyOn(loader as any, "fetchRecords");
      const mockRecordsPage1 = [
        {
          id: "rec1",
          fields: { Name: "Record 1" },
          createdTime: "2021-01-01T00:00:00.000Z",
        },
      ];
      const mockRecordsPage2 = [
        {
          id: "rec2",
          fields: { Name: "Record 2" },
          createdTime: "2021-01-02T00:00:00.000Z",
        },
      ];

      // Mock fetchRecords to simulate pagination
      mockFetchRecords
        .mockResolvedValueOnce({
          records: mockRecordsPage1,
          offset: "next-page",
        })
        .mockResolvedValueOnce({ records: mockRecordsPage2 });

      const documents = await loader.load();

      expect(documents).toHaveLength(2);
      expect(documents[0].pageContent).toBe(
        JSON.stringify(mockRecordsPage1[0])
      );
      expect(documents[1].pageContent).toBe(
        JSON.stringify(mockRecordsPage2[0])
      );
      expect(mockFetchRecords).toHaveBeenCalledTimes(2);
    });

    it("should retry fetchRecords on failure", async () => {
      const loader = new AirtableLoader({
        tableId: "tableId",
        baseId: "baseId",
      });

      const mockFetchRecords = jest.spyOn(loader as any, "fetchRecords");
      const mockError = new Error("Network Error");
      const mockRecords = [
        {
          id: "rec1",
          fields: { Name: "Record 1" },
          createdTime: "2021-01-01T00:00:00.000Z",
        },
      ];

      // Simulate a failure on the first call and success on the second
      mockFetchRecords
        .mockRejectedValueOnce(mockError)
        .mockResolvedValueOnce({ records: mockRecords });

      const documents = await loader.load();

      expect(documents).toHaveLength(1);
      expect(documents[0].pageContent).toBe(JSON.stringify(mockRecords[0]));
      expect(mockFetchRecords).toHaveBeenCalledTimes(2);
    });
  });

  // Tests for the loadLazy method
  describe("loadLazy", () => {
    it("should yield documents correctly", async () => {
      const loader = new AirtableLoader({
        tableId: "tableId",
        baseId: "baseId",
      });

      const mockFetchRecords = jest.spyOn(loader as any, "fetchRecords");
      const mockRecords = [
        {
          id: "rec1",
          fields: { Name: "Record 1" },
          createdTime: "2021-01-01T00:00:00.000Z",
        },
        {
          id: "rec2",
          fields: { Name: "Record 2" },
          createdTime: "2021-01-02T00:00:00.000Z",
        },
      ];

      mockFetchRecords.mockResolvedValue({ records: mockRecords });

      const documents: Document[] = [];
      for await (const doc of loader.loadLazy()) {
        documents.push(doc);
      }

      expect(documents).toHaveLength(2);
      expect(documents[0].pageContent).toBe(JSON.stringify(mockRecords[0]));
      expect(documents[1].pageContent).toBe(JSON.stringify(mockRecords[1]));
      expect(mockFetchRecords).toHaveBeenCalledTimes(1);
    });

    it("should handle errors in loadLazy", async () => {
      const loader = new AirtableLoader({
        tableId: "tableId",
        baseId: "baseId",
      });

      const mockFetchRecords = jest.spyOn(loader as any, "fetchRecords");
      const mockError = new Error("Network Error");

      mockFetchRecords.mockRejectedValue(mockError);

      const iterator = loader.loadLazy();
      await expect(iterator.next()).rejects.toThrow(
        "Failed to load Airtable records lazily"
      );
      expect(mockFetchRecords).toHaveBeenCalled();
    });
  });
});