"""Prepare a Pinecone vector index from a directory of PDF files.

Usage: python prepare_db.py <data_source> <index_name>
"""
import os
import sys
import pinecone
# So many different loader types!
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
# See this for a discussion on the different splitters: https://github.com/hwchase17/langchain/discussions/3786
from langchain.text_splitter import (
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
TokenTextSplitter,
)
from langchain.vectorstores import Pinecone, VectorStore
# --- OpenAI credentials -------------------------------------------------
# The key is issued at platform.openai.com/account/api-keys.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY:
    # Write the value back under the same environment variable name.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
# TODO: Improve in the future by prompting for the key with getpass, e.g.
# os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# --- Pinecone credentials -----------------------------------------------
# The key is shown at app.pinecone.io; the environment name is listed next
# to the API key in the console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "us-west4-gcp-free")

# --- Data sources -------------------------------------------------------
# TODO: Extend to non-PDF files in the future as well.
ALL_PDFS_PATTERN = "**/*.pdf"  # glob that only retrieves PDFs

# --- Constants ----------------------------------------------------------
CHUNK_SIZE = 1_000  # number of characters per chunk
CHUNK_OVERLAP = 0  # number of characters to overlap between chunks
# Default embedding model, see
# https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
OPEN_AI_EMBEDDING = "text-embedding-ada-002"
# Default context length, taken from the embedding wrapper's source code.
OPEN_AI_EMBEDDING_CTX_LENGTH = 8_191
def get_embeddings() -> OpenAIEmbeddings:
    """Build the OpenAI embedding client configured by this module.

    Returns:
        An ``OpenAIEmbeddings`` instance using the ``OPEN_AI_EMBEDDING`` model
        and the ``OPEN_AI_EMBEDDING_CTX_LENGTH`` context length.
    """
    return OpenAIEmbeddings(
        model=OPEN_AI_EMBEDDING,
        embedding_ctx_length=OPEN_AI_EMBEDDING_CTX_LENGTH,
    )


def pinecone_init() -> None:
    """Initialise the Pinecone client from the module-level credentials.

    Passes ``PINECONE_API_KEY`` and ``PINECONE_ENV`` to ``pinecone.init``.
    """
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)


def pinecone_db(index_name: str) -> VectorStore:
    """Open an existing Pinecone index as a LangChain vector store.

    Args:
        index_name: Name of the Pinecone index to open.

    Returns:
        The vector store produced by ``Pinecone.from_existing_index``, using
        the embeddings from ``get_embeddings()``.
    """
    pinecone_init()
    return Pinecone.from_existing_index(
        index_name=index_name,
        embedding=get_embeddings(),
    )
def prepare_db(data_source: str, index_name: str, pattern: str) -> VectorStore:
    """Load, chunk, embed and store documents in a Pinecone index.

    Steps:
        1. Load documents from ``data_source`` matching ``pattern``.
        2. Split the documents into chunks (1 chunk = 1 embedding).
        3. Create embeddings for each chunk.
        4. Store the chunks in the Pinecone index ``index_name``.
        5. Return the resulting vector store.

    Args:
        data_source: Directory handed to ``DirectoryLoader``.
        index_name: Name of the Pinecone index to write to.
        pattern: Glob pattern selecting which files to load.

    Returns:
        The vector store returned by ``Pinecone.from_documents``.
    """
    loader = DirectoryLoader(
        data_source,
        glob=pattern,
        show_progress=True,
    )
    docs = loader.load()

    # RecursiveCharacterTextSplitter tries its separators in the order given
    # and only falls through to later ones for pieces that are still too
    # large. They must therefore go from coarsest to finest; with " " first,
    # every text containing a space was split word by word and the other
    # separators were never reached.
    # TODO: Investigate the difference between this and CharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n", ",", " "],
    )
    docs_split = text_splitter.split_documents(docs)

    embeddings = get_embeddings()
    pinecone_init()
    return Pinecone.from_documents(docs_split, embeddings, index_name=index_name)
def main(datadir, index_name):
    """Index every PDF under ``datadir`` into ``index_name`` and report it.

    Echoes the arguments, calls ``prepare_db`` with ``ALL_PDFS_PATTERN`` and
    prints the returned vector store.
    """
    print("args", datadir, index_name)
    vector_store = prepare_db(
        data_source=datadir,
        index_name=index_name,
        pattern=ALL_PDFS_PATTERN,
    )
    summary = f"doc_db for {datadir} initialized at index {index_name}"
    print(summary, vector_store)
if __name__ == "__main__":
    # Exactly two positional arguments are expected; they are passed to
    # main() as (datadir, index_name).
    if len(sys.argv) != 3:
        # The second argument is the index name, not a glob pattern.
        print("Usage: python prepare_db.py <data_source> <index_name>")
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])