mongodb-developer · sgsshankar · May 7, 2024 · May 7, 2024 · May 7, 2024 · May 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,175 @@
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+atlas-rag/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+# Param File
+key_param.py
diff --git a/README.md b/README.md
@@ -1,22 +1,46 @@
 # Atlas Vector Search with RAG
 
-The Python scripts in this repo use Atlas Vector Search with Retrieval-Augmented Generation (RAG) architecture to build a Question Answering application. They use the LangChain framework, OpenAI models, as well as Gradio in conjunction with Atlas Vector Search in a RAG architecture, to create this app.
+The Python scripts in this repo use Atlas Vector Search with Retrieval-Augmented Generation (RAG) architecture to build a Question Answering application. They use the LangChain framework, Azure OpenAI models, as well as Gradio in conjunction with Atlas Vector Search in a RAG architecture, to create this app.
 
 
 ## Setting up the Environment
 
-1. Install the following packages:
+1. Install the `virtualenv` package (optional; the next step uses Python's built-in `venv` module):
 ```
-pip3 install langchain pymongo bs4 openai tiktoken gradio requests lxml argparse unstructured
+pip3 install virtualenv
 ```
-2. Create OpenAI API Key from [here](https://platform.openai.com/account/api-keys). Note that this requires a paid account with OpenAI, with enough credits. OpenAI API requests stop working if credit balance reaches `$0`.
+2. Create a Python virtual environment named `atlas-rag`:
+```
+python -m venv atlas-rag
+```
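+3. Activate the virtual environment (e.g. `source atlas-rag/bin/activate` on Linux/macOS or `atlas-rag\Scripts\activate` on Windows), then install the dependencies: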
+```
+pip3 install -r requirements.txt
+```
+4. For Azure OpenAI:
+Create an Azure OpenAI resource, its model deployments (one embeddings model and one chat model), and an API key by following [this guide](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal). Note that this requires an Azure subscription.
+
+5. Rename `key_param_template.py` to `key_param.py` and fill in your MongoDB URI, Azure OpenAI endpoint, API key, and deployment names.
 
-3. Save the OpenAI API key and the MongoDB URI in the `key_param.py` file, like this:
+6. Create an Atlas Search index named `default` on the `langchain_demo.collection_of_text_blobs` collection by following [this guide](https://www.mongodb.com/docs/atlas/atlas-search/create-index/), using the following JSON definition:
 ```
-openai_api_key = "ENTER_OPENAI_API_KEY_HERE"
-MONGO_URI = "ENTER_MONGODB_URI_HERE"
+{
+  "mappings": {
+    "dynamic": false,
+    "fields": {
+      "embedding": [
+        {
+          "dimensions": 1536,
+          "similarity": "cosine",
+          "type": "knnVector"
+        }
+      ]
+    }
+  }
+}
 ```
-4. Use the following two python scripts:
+
+7. Use the following two python scripts:
    - **load_data.py**: This script will be used to load your documents and ingest the text and vector embeddings, in a MongoDB collection.
    - **extract_information.py**: This script will generate the user interface and will allow you to perform question-answering against your data, using Atlas Vector Search and OpenAI.
 
@@ -30,5 +54,5 @@ MONGO_URI = "ENTER_MONGODB_URI_HERE"
 | LangChain                                                                                                                  | OpenAI                                                                                                                           | Atlas Vector Search                                                                                                  | Gradio                                                     |
 |----------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------|
 | [**DirectoryLoader**](https://api.python.langchain.com/en/latest/document_loaders/langchain.document_loaders.unstructured.UnstructuredFileLoader.html): <br> - All documents from a directory <br> - Split and load <br> - Uses the [Unstructured](https://python.langchain.com/docs/integrations/document_loaders/unstructured_file.html) package | **Embedding Model**: <br> - [text-embedding-ada-002](https://openai.com/blog/new-and-improved-embedding-model) <br> - Text → Vector embeddings <br> - 1536 dimensions           | [**Vector Store**](https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/)                             | [UI](https://www.gradio.app/) for LLM app <br> - Open-source Python library <br> - Allows to quickly create user interfaces for ML models |
-| [**RetrievalQA**](https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval_qa.base.BaseRetrievalQA.html?highlight=retrievalqa#langchain.chains.retrieval_qa.base.BaseRetrievalQA): <br> - Retriever <br> - Question-answering chain                       | **Language model**: <br> - [gpt-3.5-turbo](https://platform.openai.com/docs/models/gpt-3-5) <br> - Understands and generates natural language <br> - Generates text, answers, translations, etc.                                       |                                                                                                                           |                                                            |
+| [**RetrievalQA**](https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval_qa.base.BaseRetrievalQA.html?highlight=retrievalqa#langchain.chains.retrieval_qa.base.BaseRetrievalQA): <br> - Retriever <br> - Question-answering chain                       | **Language model**: <br> - [gpt-3.5-turbo](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-35) <br> - Understands and generates natural language <br> - Generates text, answers, translations, etc.                                       |                                                                                                                           |                                                            |
 | [**MongoDBAtlasVectorSearch**](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.mongodb_atlas.MongoDBAtlasVectorSearch.html): <br> - Wrapper around Atlas Vector Search <br> - Easily create and store embeddings in MongoDB collections <br> - Perform KNN Search using Atlas Vector Search          |                                                                                                                                                                                      |                                                                                                                           |                                                            |
diff --git a/extract_information.py b/extract_information.py
@@ -1,8 +1,7 @@
 from pymongo import MongoClient
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.vectorstores import MongoDBAtlasVectorSearch
-from langchain.document_loaders import DirectoryLoader
-from langchain.llms import OpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+from langchain_community.vectorstores import MongoDBAtlasVectorSearch
+from langchain_openai import AzureChatOpenAI
 from langchain.chains import RetrievalQA
 import gradio as gr
 from gradio.themes.base import Base
@@ -11,45 +10,62 @@
 client = MongoClient(key_param.MONGO_URI)
 dbName = "langchain_demo"
 collectionName = "collection_of_text_blobs"
+searchIndexName ="default"
 collection = client[dbName][collectionName]
 
 # Define the text embedding model
-
-embeddings = OpenAIEmbeddings(openai_api_key=key_param.openai_api_key)
+embeddings = AzureOpenAIEmbeddings(deployment=key_param.AZURE_EMBEDDINGS_DEPLOYMENT, 
+                                    azure_endpoint = key_param.AZURE_OPENAI_ENDPOINT,
+                                    openai_api_version = key_param.AZURE_OPENAI_API_VERSION,
+                                    openai_api_key=key_param.AZURE_OPENAI_API_KEY,
+                                    show_progress_bar=True)
 
 # Initialize the Vector Store
 
-vectorStore = MongoDBAtlasVectorSearch( collection, embeddings )
+vectorStore = MongoDBAtlasVectorSearch.from_connection_string(key_param.MONGO_URI, f"{dbName}.{collectionName}", embeddings, index_name=searchIndexName)  # index_name selects the Atlas index to query
 
 def query_data(query):
     # Convert question to vector using OpenAI embeddings
     # Perform Atlas Vector Search using Langchain's vectorStore
-    # similarity_search returns MongoDB documents most similar to the query    
+    # similarity_search returns MongoDB documents most similar to the query
 
-    docs = vectorStore.similarity_search(query, K=1)
-    as_output = docs[0].page_content
+    # Get VectorStoreRetriever: Specifically, Retriever for MongoDB VectorStore.
+    # Implements _get_relevant_documents which retrieves documents relevant to a query.
+    retriever = vectorStore.as_retriever(
+        search_type="similarity",  # the Atlas index is configured on vectorStore, not on the retriever
+        search_kwargs={"k": 1})  # retrieve only the single most similar document
 
     # Leveraging Atlas Vector Search paired with Langchain's QARetriever
+    docs = vectorStore.similarity_search(query, k=1)  # the parameter is lowercase `k`; `K` is not a recognized option
+    as_output = docs[0].page_content if docs else ""  # no match (e.g. empty collection) -> empty string instead of IndexError
 
     # Define the LLM that we want to use -- note that this is the Language Generation Model and NOT an Embedding Model
     # If it's not specified (for example like in the code below),
     # then the default OpenAI model used in LangChain is OpenAI GPT-3.5-turbo, as of August 30, 2023
-
-    llm = OpenAI(openai_api_key=key_param.openai_api_key, temperature=0)
-
-
-    # Get VectorStoreRetriever: Specifically, Retriever for MongoDB VectorStore.
-    # Implements _get_relevant_documents which retrieves documents relevant to a query.
-    retriever = vectorStore.as_retriever()
+    # With Azure OpenAI the model is selected by the deployment name, which is
+    # read from key_param.AZURE_LANGUAGE_DEPLOYMENT; the endpoint, API version
+    # and API key come from key_param as well.
+
+    llm = AzureChatOpenAI(
+        azure_deployment=key_param.AZURE_LANGUAGE_DEPLOYMENT,
+        azure_endpoint=key_param.AZURE_OPENAI_ENDPOINT,
+        openai_api_version=key_param.AZURE_OPENAI_API_VERSION,
+        openai_api_key=key_param.AZURE_OPENAI_API_KEY,
+        temperature=0,
+    )
 
     # Load "stuff" documents chain. Stuff documents chain takes a list of documents,
     # inserts them all into a prompt and passes that prompt to an LLM.
 
-    qa = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=retriever)
-
+    qa = RetrievalQA.from_chain_type(llm, 
+                                     chain_type="stuff", 
+                                     retriever=retriever)
+
     # Execute the chain
 
-    retriever_output = qa.run(query)
+    retriever_output = qa.invoke(query)
+
+    retriever_output['result'] = retriever_output['result'].replace("\n"," ") 
 
     # Return Atlas Vector Search output, and output generated using RAG Architecture
     return as_output, retriever_output

diff --git a/key_param_template.py b/key_param_template.py
@@ -0,0 +1,6 @@
+MONGO_URI = "<MongoDB URI>"  # MongoDB Atlas connection string
+AZURE_OPENAI_API_VERSION = "2024-02-15-preview"
+AZURE_OPENAI_ENDPOINT = "https://<Resource Name>.openai.azure.com/"
+AZURE_OPENAI_API_KEY = "<Azure OpenAI Key>"
+AZURE_EMBEDDINGS_DEPLOYMENT = "<Embeddings Deployment Name>"  # passed to AzureOpenAIEmbeddings
+AZURE_LANGUAGE_DEPLOYMENT = "<Chat Model Deployment Name>"  # passed to AzureChatOpenAI
diff --git a/load_data.py b/load_data.py
@@ -1,15 +1,14 @@
 from pymongo import MongoClient
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.vectorstores import MongoDBAtlasVectorSearch
-from langchain.document_loaders import DirectoryLoader
-from langchain.llms import OpenAI
+from langchain_openai import AzureOpenAIEmbeddings
+from langchain_mongodb import MongoDBAtlasVectorSearch
+from langchain_community.document_loaders import DirectoryLoader
 import key_param
 
 # Set the MongoDB URI, DB, Collection Names
-
 client = MongoClient(key_param.MONGO_URI)
 dbName = "langchain_demo"
 collectionName = "collection_of_text_blobs"
+searchIndexName ="default"
 collection = client[dbName][collectionName]
 
 # Initialize the DirectoryLoader
@@ -18,8 +17,12 @@
 
 # Define the OpenAI Embedding Model we want to use for the source data
 # The embedding model is different from the language generation model
-embeddings = OpenAIEmbeddings(openai_api_key=key_param.openai_api_key)
+embeddings = AzureOpenAIEmbeddings(deployment=key_param.AZURE_EMBEDDINGS_DEPLOYMENT, 
+                                    azure_endpoint = key_param.AZURE_OPENAI_ENDPOINT,
+                                    openai_api_version = key_param.AZURE_OPENAI_API_VERSION,
+                                    openai_api_key=key_param.AZURE_OPENAI_API_KEY,
+                                    show_progress_bar=True)
 
 # Initialize the VectorStore, and
 # vectorise the text from the documents using the specified embedding model, and insert them into the specified MongoDB collection
-vectorStore = MongoDBAtlasVectorSearch.from_documents( data, embeddings, collection=collection )
+vectorStore = MongoDBAtlasVectorSearch.from_documents( data, embeddings, collection=collection, index_name=searchIndexName)
diff --git a/queries b/queries
@@ -0,0 +1,6 @@
+What did Alfred say?
+Did any error on August 16th and if yes what caused the error?
+What were Bruce's answers? Summarize in bullet points
+What is Airflow?
+What was the overall sentiment of Alfred's chat with Bruce?
+What was the likely CSAT?
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,14 @@
+langchain==0.1.16
+langchain-community==0.0.34
+langchain-openai==0.1.3
+langchain-mongodb==0.1.3
+lark==1.1.9
+pymongo==4.6.3
+bs4==0.0.2
+openai==1.23.2 
+tiktoken==0.6.0 
+gradio==4.29.0 
+requests==2.31.0 
+lxml==5.2.1 
+argparse 
+unstructured==0.13.3