Upstream changes for 0.4.0 release (#53)
0.4.0 release
sumitkbh committed Feb 22, 2024
1 parent 27d5c7c commit 91d637a
Showing 84 changed files with 9,645 additions and 322 deletions.
24 changes: 23 additions & 1 deletion CHANGELOG.md
@@ -3,6 +3,28 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.4.0] - 2024-02-22

### Added

- [New dedicated notebooks](./docs/rag/jupyter_server.md) showcasing usage of cloud-based NVIDIA AI Playground models via LangChain connectors, as well as local model deployment using Hugging Face.
- Upgraded the Milvus container version to enable GPU-accelerated vector search.
- Added support for interacting with models behind NeMo Inference Microservices using the new model engines `nemo-embed` and `nemo-infer`.
- Added support for providing an example-specific collection name for vector databases via an environment variable named `COLLECTION_NAME`; a short sketch follows this list.
- Added `faiss` as a generic vector database option in `utils.py`.
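
A minimal sketch of the collection-name behavior (illustrative only; it assumes a configured vector store is running, and `my_collection` is a placeholder name):

```python
import os

from RetrievalAugmentedGeneration.common.utils import get_vector_index

# With no explicit argument, the utilities fall back to the COLLECTION_NAME
# environment variable, and finally to the default "vector_db".
os.environ["COLLECTION_NAME"] = "my_collection"         # placeholder name
index = get_vector_index()                              # resolves to "my_collection"
other = get_vector_index("another_collection")          # explicit argument takes precedence
```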

### Changed

- Upgraded the base containers for all components to PyTorch `23.12-py3`.
- Added a LangChain-specific vector database connector in `utils.py`.
- Changed speech support to use a single channel for Riva ASR and TTS.
- Changed the `get_llm` utility in `utils.py` to return a LangChain wrapper instead of LlamaIndex wrappers.

### Fixed

- Fixed a bug causing empty ratings in the evaluation notebook.
- Fixed the document search implementation of the query decomposition example.

## [0.3.0] - 2024-01-22

### Added
@@ -53,4 +75,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Fixed

- [Fixed issue #13](https://github.com/NVIDIA/GenerativeAIExamples/issues/13) of pipeline not able to answer questions unrelated to knowledge base
- [Fixed issue #12](https://github.com/NVIDIA/GenerativeAIExamples/issues/12) typechecking while uploading PDF files
6 changes: 4 additions & 2 deletions RetrievalAugmentedGeneration/Dockerfile
@@ -1,22 +1,24 @@
ARG BASE_IMAGE_URL=nvcr.io/nvidia/pytorch
ARG BASE_IMAGE_TAG=23.08-py3
ARG BASE_IMAGE_TAG=23.12-py3

FROM ${BASE_IMAGE_URL}:${BASE_IMAGE_TAG}

ARG EXAMPLE_NAME
COPY RetrievalAugmentedGeneration/__init__.py /opt/RetrievalAugmentedGeneration/
COPY RetrievalAugmentedGeneration/common /opt/RetrievalAugmentedGeneration/common
COPY RetrievalAugmentedGeneration/examples/${EXAMPLE_NAME} /opt/RetrievalAugmentedGeneration/example
COPY integrations /opt/integrations
COPY tools /opt/tools
RUN apt-get update && apt-get install -y libpq-dev
RUN --mount=type=bind,source=RetrievalAugmentedGeneration/requirements.txt,target=/opt/requirements.txt \
python3 -m pip install --no-cache-dir -r /opt/requirements.txt

COPY RetrievalAugmentedGeneration/examples/${EXAMPLE_NAME} /opt/RetrievalAugmentedGeneration/example
RUN if [ -f "/opt/RetrievalAugmentedGeneration/example/requirements.txt" ] ; then \
python3 -m pip install --no-cache-dir -r /opt/RetrievalAugmentedGeneration/example/requirements.txt ; else \
echo "Skipping example dependency installation, since requirements.txt was not found" ; \
fi

RUN apt-get remove python3-pip

WORKDIR /opt
ENTRYPOINT ["uvicorn", "RetrievalAugmentedGeneration.common.server:app"]
5 changes: 5 additions & 0 deletions RetrievalAugmentedGeneration/common/configuration.py
@@ -114,6 +114,11 @@ class EmbeddingConfig(ConfigWizard):
default=1024,
help_txt="The required dimensions of the embedding model. Currently utilized for vector DB indexing.",
)
server_url: str = configfield(
"server_url",
default="localhost:9080",
help_txt="The url of the server hosting nemo embedding model",
)


@configclass
109 changes: 97 additions & 12 deletions RetrievalAugmentedGeneration/common/utils.py
@@ -18,6 +18,7 @@
import base64
import logging
from functools import lru_cache
from urllib.parse import urlparse
from typing import TYPE_CHECKING, List, Optional

logger = logging.getLogger(__name__)
@@ -33,7 +34,7 @@
logger.error(f"psycogp2 import failed with error: {e}")

try:
from sqlalchemy import make_url
from sqlalchemy.engine.url import make_url
except Exception as e:
logger.error(f"SQLalchemy import failed with error: {e}")

@@ -55,15 +56,33 @@
try:
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
except Exception as e:
logger.error(f"Langchain import failed with error: {e}")

try:
from langchain_core.vectorstores import VectorStore
except Exception as e:
logger.error(f"Langchain core import failed with error: {e}")

try:
from langchain_community.vectorstores import PGVector
from langchain_community.vectorstores import Milvus
except Exception as e:
logger.error(f"Langchain community import failed with error: {e}")

try:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from langchain_community.chat_models import ChatOpenAI
except Exception as e:
logger.error(f"NVIDIA AI connector import failed with error: {e}")

from langchain_core.embeddings import Embeddings
from langchain_core.language_models.chat_models import SimpleChatModel
from langchain.llms.base import LLM
from integrations.langchain.llms.triton_trt_llm import TensorRTLLM
from integrations.langchain.llms.nemo_infer import NemoInfer
from integrations.langchain.embeddings.nemo_embed import NemoEmbeddings
from RetrievalAugmentedGeneration.common import configuration

if TYPE_CHECKING:
@@ -102,8 +121,10 @@ def _postprocess_nodes(
@lru_cache
def set_service_context() -> None:
"""Set the global service context."""
llm = LangChainLLM(get_llm())
embedding = LangchainEmbedding(get_embedding_model())
service_context = ServiceContext.from_defaults(
llm=get_llm(), embed_model=get_embedding_model()
llm=llm, embed_model=embedding
)
set_global_service_context(service_context)

@@ -119,15 +140,19 @@ def get_config() -> "ConfigWizard":


@lru_cache
def get_vector_index() -> VectorStoreIndex:
def get_vector_index(collection_name: str = "") -> VectorStoreIndex:
"""Create the vector db index."""
config = get_config()
vector_store = None

logger.info(f"Using {config.vector_store.name} as vector store")

if config.vector_store.name == "pgvector":
db_name = os.getenv('POSTGRES_DB', 'vector_db')
db_name = os.getenv('POSTGRES_DB', None)
if not collection_name:
collection_name = os.getenv('COLLECTION_NAME', "vector_db")
connection_string = f"postgresql://{os.getenv('POSTGRES_USER', '')}:{os.getenv('POSTGRES_PASSWORD', '')}@{config.vector_store.url}/{db_name}"
logger.info(f"Using PGVector collection: {collection_name}")

conn = psycopg2.connect(connection_string)
conn.autocommit = True
@@ -146,21 +171,60 @@ def get_vector_index() -> VectorStoreIndex:
password=url.password,
port=url.port,
user=url.username,
table_name="document_store",
embed_dim=config.embeddings.dimensions,
table_name=collection_name,
embed_dim=config.embeddings.dimensions
)
elif config.vector_store.name == "milvus":
if not collection_name:
collection_name = os.getenv('COLLECTION_NAME', "vector_db")
logger.info(f"Using milvus collection: {collection_name}")
vector_store = MilvusVectorStore(uri=config.vector_store.url,
dim=config.embeddings.dimensions,
collection_name="document_store_ivfflat",
index_config={"index_type": "IVF_FLAT", "nlist": config.vector_store.nlist},
collection_name=collection_name,
index_config={"index_type": "GPU_IVF_FLAT", "nlist": config.vector_store.nlist},
search_config={"nprobe": config.vector_store.nprobe},
overwrite=False)
else:
raise RuntimeError("Unable to find any supported Vector Store DB. Supported engines are milvus and pgvector.")
return VectorStoreIndex.from_vector_store(vector_store)


def get_vectorstore_langchain(documents, document_embedder, collection_name: str = "") -> VectorStore:
"""Create the vector db index for langchain."""

config = get_config()

if config.vector_store.name == "faiss":
vectorstore = FAISS.from_documents(documents, document_embedder)
elif config.vector_store.name == "pgvector":
db_name = os.getenv('POSTGRES_DB', None)
if not collection_name:
collection_name = os.getenv('COLLECTION_NAME', "vector_db")
logger.info(f"Using PGVector collection: {collection_name}")
connection_string = f"postgresql://{os.getenv('POSTGRES_USER', '')}:{os.getenv('POSTGRES_PASSWORD', '')}@{config.vector_store.url}/{db_name}"
vectorstore = PGVector.from_documents(
embedding=document_embedder,
documents=documents,
collection_name=collection_name,
connection_string=connection_string,
)
elif config.vector_store.name == "milvus":
if not collection_name:
collection_name = os.getenv('COLLECTION_NAME', "vector_db")
logger.info(f"Using milvus collection: {collection_name}")
url = urlparse(config.vector_store.url)
vectorstore = Milvus.from_documents(
documents,
document_embedder,
collection_name=collection_name,
connection_args={"host": url.hostname, "port": url.port}
)
else:
raise ValueError(f"{config.vector_store.name} vector database is not supported")
logger.info("Vector store created and saved.")
return vectorstore
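
# --- Illustrative usage sketch (not part of this commit); collection name and query are placeholders ---
def _example_vectorstore_usage(documents):
    """Hypothetical helper: embed pre-split documents into the configured vector store."""
    document_embedder = get_embedding_model()
    # An explicit collection name overrides COLLECTION_NAME; with faiss an in-memory index is built instead.
    vectorstore = get_vectorstore_langchain(documents, document_embedder, collection_name="example_docs")
    return vectorstore.as_retriever().get_relevant_documents("example query")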


@lru_cache
def get_doc_retriever(num_nodes: int = 4) -> "BaseRetriever":
"""Create the document retriever."""
@@ -169,7 +233,7 @@ def get_doc_retriever(num_nodes: int = 4) -> "BaseRetriever":


@lru_cache
def get_llm() -> LangChainLLM:
def get_llm() -> LLM | SimpleChatModel:
"""Create the LLM connection."""
settings = get_config()

@@ -180,15 +244,30 @@ def get_llm() -> LangChainLLM:
model_name=settings.llm.model_name,
tokens=DEFAULT_NUM_TOKENS,
)
return LangChainLLM(llm=trtllm)
return trtllm
elif settings.llm.model_engine == "nv-ai-foundation":
return ChatNVIDIA(model=settings.llm.model_name)
elif settings.llm.model_engine == "nemo-infer":
nemo_infer = NemoInfer(
server_url=f"http://{settings.llm.server_url}/v1/completions",
model=settings.llm.model_name,
tokens=DEFAULT_NUM_TOKENS,
)
return nemo_infer
elif settings.llm.model_engine == "nemo-infer-openai":
nemo_infer = ChatOpenAI(
openai_api_base=f"http://{settings.llm.server_url}/v1/",
openai_api_key="xyz",
model_name=settings.llm.model_name,
max_tokens=DEFAULT_NUM_TOKENS,
)
return nemo_infer
else:
raise RuntimeError("Unable to find any supported Large Language Model server. Supported engines are triton-trt-llm and nv-ai-foundation.")


@lru_cache
def get_embedding_model() -> LangchainEmbedding:
def get_embedding_model() -> Embeddings:
"""Create the embedding model."""
model_kwargs = {"device": "cpu"}
if torch.cuda.is_available():
@@ -205,9 +284,15 @@ def get_embedding_model() -> LangchainEmbedding:
encode_kwargs=encode_kwargs,
)
# Load in a specific embedding model
return LangchainEmbedding(hf_embeddings)
return hf_embeddings
elif settings.embeddings.model_engine == "nv-ai-foundation":
return NVIDIAEmbeddings(model=settings.embeddings.model_name, model_type="passage")
elif settings.embeddings.model_engine == "nemo-embed":
nemo_embed = NemoEmbeddings(
server_url=f"http://{settings.embeddings.server_url}/v1/embeddings",
model_name=settings.embeddings.model_name,
)
return nemo_embed
else:
raise RuntimeError("Unable to find any supported embedding model. Supported engine is huggingface.")

19 changes: 14 additions & 5 deletions RetrievalAugmentedGeneration/examples/developer_rag/chains.py
@@ -24,6 +24,8 @@
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response.schema import StreamingResponse
from llama_index.node_parser import LangchainNodeParser
from llama_index.llms import LangChainLLM
from llama_index.embeddings import LangchainEmbedding

from RetrievalAugmentedGeneration.common.utils import (
LimitRetrievedNodesLength,
@@ -91,7 +93,8 @@ def llm_chain(self, context: str, question: str, num_tokens: int) -> Generator[s
)

logger.info(f"Prompt used for response generation: {prompt}")
response = get_llm().stream_complete(prompt, tokens=num_tokens)
llm = LangChainLLM(get_llm())
response = llm.stream_complete(prompt, tokens=num_tokens)
gen_response = (resp.delta for resp in response)
return gen_response

@@ -101,10 +104,16 @@ def rag_chain(self, prompt: str, num_tokens: int) -> Generator[str, None, None]:
logger.info("Using rag to generate response from document")

set_service_context()
if get_config().llm.model_engine == "triton-trt-llm":
get_llm().llm.tokens = num_tokens # type: ignore
else:
get_llm().llm.max_tokens = num_tokens
llm = LangChainLLM(get_llm())

try:
if get_config().llm.model_engine == "triton-trt-llm" or get_config().llm.model_engine == "nemo-infer":
llm.llm.tokens = num_tokens # type: ignore
else:
llm.llm.max_tokens = num_tokens
except Exception as e:
logger.error(f"Exception in setting llm tokens: {e}")

retriever = get_doc_retriever(num_nodes=4)
qa_template = Prompt(get_config().prompts.rag_template)

@@ -25,7 +25,7 @@
from langchain_core.prompts import ChatPromptTemplate
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from RetrievalAugmentedGeneration.common.base import BaseExample
from RetrievalAugmentedGeneration.common.utils import get_config, get_llm, get_embedding_model
from RetrievalAugmentedGeneration.common.utils import get_config, get_llm, get_embedding_model, get_vectorstore_langchain

logger = logging.getLogger(__name__)
DOCS_DIR = os.path.abspath("./uploaded_files")
@@ -38,7 +38,6 @@
class NvidiaAIFoundation(BaseExample):
def ingest_docs(self, file_name: str, filename: str):
"""Ingest documents to the VectorDB."""

try:
# TODO: Load embedding created in older conversation, memory persistance
# We initialize class in every call therefore it should be global
@@ -54,8 +53,7 @@ def ingest_docs(self, file_name: str, filename: str):
if vectorstore:
vectorstore.add_documents(documents)
else:
vectorstore = FAISS.from_documents(documents, document_embedder)
logger.info("Vector store created and saved.")
vectorstore = get_vectorstore_langchain(documents, document_embedder)
else:
logger.warning("No documents available to process!")
except Exception as e:
@@ -106,8 +104,14 @@ def rag_chain(self, prompt: str, num_tokens: int) -> Generator[str, None, None]:

try:
if vectorstore != None:
retriever = vectorstore.as_retriever()
docs = retriever.get_relevant_documents(prompt)
try:
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.25})
docs = retriever.get_relevant_documents(prompt)
except NotImplementedError:
# Some retriever like milvus don't have similarity score threshold implemented
retriever = vectorstore.as_retriever()
docs = retriever.get_relevant_documents(prompt)


context = ""
for doc in docs:
@@ -134,8 +138,14 @@ def document_search(self, content: str, num_docs: int) -> List[Dict[str, Any]]:

try:
if vectorstore != None:
retriever = vectorstore.as_retriever()
docs = retriever.get_relevant_documents(content)
try:
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.25})
docs = retriever.get_relevant_documents(content)
except NotImplementedError:
# Some retriever like milvus don't have similarity score threshold implemented
retriever = vectorstore.as_retriever()
docs = retriever.get_relevant_documents(content)

result = []
for doc in docs:
result.append(
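
The score-threshold retrieval with its Milvus fallback appears in both `rag_chain` and `document_search` above; it could be factored into a small helper. A minimal sketch (hypothetical helper name, threshold value copied from the diff):

def _retrieve_with_score_threshold(vectorstore, query: str, score_threshold: float = 0.25):
    """Hypothetical helper: prefer similarity-score filtering, fall back for stores without it."""
    try:
        retriever = vectorstore.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"score_threshold": score_threshold},
        )
        return retriever.get_relevant_documents(query)
    except NotImplementedError:
        # Some vector stores (e.g. Milvus) do not implement similarity-score-threshold search.
        return vectorstore.as_retriever().get_relevant_documents(query)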