diff --git a/extract_information.py b/extract_information.py index 2aacead..1a0616f 100644 --- a/extract_information.py +++ b/extract_information.py @@ -1,9 +1,9 @@ from pymongo import MongoClient -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.vectorstores import MongoDBAtlasVectorSearch -from langchain.document_loaders import DirectoryLoader -from langchain.llms import OpenAI from langchain.chains import RetrievalQA +from langchain_openai import AzureOpenAIEmbeddings +from langchain_mongodb import MongoDBAtlasVectorSearch +from langchain_community.document_loaders import DirectoryLoader +from langchain_openai import AzureChatOpenAI import gradio as gr from gradio.themes.base import Base import key_param @@ -15,11 +15,11 @@ # Define the text embedding model -embeddings = OpenAIEmbeddings(openai_api_key=key_param.openai_api_key) +embeddings = AzureOpenAIEmbeddings(azure_deployment="text-embedding-ada-002", openai_api_version="2023-05-15") # Initialize the Vector Store -vectorStore = MongoDBAtlasVectorSearch( collection, embeddings ) +vectorStore = MongoDBAtlasVectorSearch( collection=collection, embedding=embeddings, index_name="vector_index" ) def query_data(query): # Convert question to vector using OpenAI embeddings @@ -27,7 +27,9 @@ def query_data(query): # similarity_search returns MongoDB documents most similar to the query docs = vectorStore.similarity_search(query, K=1) - as_output = docs[0].page_content + as_output="" + if docs: + as_output = docs[0].page_content # Leveraging Atlas Vector Search paired with Langchain's QARetriever @@ -35,7 +37,7 @@ def query_data(query): # If it's not specified (for example like in the code below), # then the default OpenAI model used in LangChain is OpenAI GPT-3.5-turbo, as of August 30, 2023 - llm = OpenAI(openai_api_key=key_param.openai_api_key, temperature=0) + llm = AzureChatOpenAI(azure_deployment="gpt-35-turbo", api_version="2024-02-01", temperature=0, max_tokens=None, timeout=None, 
max_retries=2) # Get VectorStoreRetriever: Specifically, Retriever for MongoDB VectorStore. @@ -49,10 +51,10 @@ def query_data(query): # Execute the chain - retriever_output = qa.run(query) + retriever_output = qa.invoke(query) # Return Atlas Vector Search output, and output generated using RAG Architecture - return as_output, retriever_output + return as_output, retriever_output["result"] # Create a web interface for the app, using Gradio diff --git a/load_data.py b/load_data.py index 4a36946..69bd19d 100644 --- a/load_data.py +++ b/load_data.py @@ -1,8 +1,9 @@ from pymongo import MongoClient -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.vectorstores import MongoDBAtlasVectorSearch -from langchain.document_loaders import DirectoryLoader -from langchain.llms import OpenAI +from langchain_openai import AzureOpenAIEmbeddings +from langchain_mongodb import MongoDBAtlasVectorSearch +from langchain_community.document_loaders import DirectoryLoader +from langchain_community.document_loaders import DirectoryLoader +from langchain_community.llms import OpenAI import key_param # Set the MongoDB URI, DB, Collection Names @@ -22,4 +23,4 @@ # Initialize the VectorStore, and # vectorise the text from the documents using the specified embedding model, and insert them into the specified MongoDB collection -vectorStore = MongoDBAtlasVectorSearch.from_documents( data, embeddings, collection=collection ) \ No newline at end of file +vectorStore = MongoDBAtlasVectorSearch.from_documents( data, embeddings, collection=collection ) diff --git a/sample_files/aerodynamics.txt b/sample_files/aerodynamics.txt new file mode 100644 index 0000000..59c03a8 --- /dev/null +++ b/sample_files/aerodynamics.txt @@ -0,0 +1 @@ +Boundary layer control, achieved using suction or blowing methods, can significantly reduce the aerodynamic drag on an aircraft's wing surface. The yaw angle of an aircraft, indicative of its side-to-side motion, is crucial for stability and is 
controlled primarily by the rudder. With advancements in computational fluid dynamics (CFD), engineers can accurately predict the turbulent airflow patterns around complex aircraft geometries, optimizing their design for better performance. diff --git a/sample_files/chat_conversation.txt b/sample_files/chat_conversation.txt new file mode 100644 index 0000000..6a59c76 --- /dev/null +++ b/sample_files/chat_conversation.txt @@ -0,0 +1 @@ +Alfred: Hi, can you explain to me how compression works in MongoDB? Bruce: Sure! MongoDB supports compression of data at rest. It uses either zlib or snappy compression algorithms at the collection level. When data is written, MongoDB compresses and stores it compressed. When data is read, MongoDB uncompresses it before returning it. Compression reduces storage space requirements. Alfred: Interesting, that's helpful to know. Can you also tell me how indexes are stored in MongoDB? Bruce: MongoDB indexes are stored in B-trees. The internal nodes of the B-trees contain keys that point to children nodes or leaf nodes. The leaf nodes contain references to the actual documents stored in the collection. Indexes are stored in memory and also written to disk. The in-memory B-trees provide fast access for queries using the index. Alfred: Ok that makes sense. Does MongoDB compress the indexes as well? Bruce: Yes, MongoDB also compresses the index data using prefix compression. This compresses common prefixes in the index keys to save space. However, the compression is lightweight and focused on performance vs storage space. Index compression is enabled by default. Alfred: Great, that's really helpful context on how indexes are handled. One last question - when I query on a non-indexed field, how does MongoDB actually perform the scanning? Bruce: MongoDB performs a collection scan if a query does not use an index. It will scan every document in the collection in memory and on disk to select the documents that match the query. 
This can be resource intensive for large collections without indexes, so indexing improves query performance. Alfred: Thank you for the detailed explanations Bruce, I really appreciate you taking the time to walk through how compression and indexes work under the hood in MongoDB. Very helpful! Bruce: You're very welcome! I'm glad I could explain the technical details clearly. Feel free to reach out if you have any other MongoDB questions. diff --git a/sample_files/log_example.txt b/sample_files/log_example.txt new file mode 100644 index 0000000..7c1daea --- /dev/null +++ b/sample_files/log_example.txt @@ -0,0 +1 @@ +2023-08-16T16:43:06.537+0000 I MONGOT [63528f5c2c4f78275d37902d-f5-u6-a0 BufferlessChangeStreamApplier] [63528f5c2c4f78275d37902d-f5-u6-a0 BufferlessChangeStreamApplier] Starting change stream from opTime=Timestamp{value=7267960339944178238, seconds=1692203884, inc=574}2023-08-16T16:43:06.543+0000 W MONGOT [63528f5c2c4f78275d37902d-f5-u6-a0 BufferlessChangeStreamApplier] [c.x.m.r.m.common.SchedulerQueue] cancelling queue batches for 63528f5c2c4f78275d37902d-f5-u6-a02023-08-16T16:43:06.544+0000 E MONGOT [63528f5c2c4f78275d37902d-f5-u6-a0 InitialSyncManager] [BufferlessInitialSyncManager 63528f5c2c4f78275d37902d-f5-u6-a0] Caught exception waiting for change stream events to be applied. Shutting down.com.xgen.mongot.replication.mongodb.common.InitialSyncException: com.mongodb.MongoCommandException: Command failed with error 286 (ChangeStreamHistoryLost): 'Executor error during getMore :: caused by :: Resume of change stream was not possible, as the resume point may no longer be in the oplog.' on server atlas-6keegs-shard-00-01.4bvxy.mongodb.net:27017.2023-08-16T16:43:06.545+0000 I MONGOT [indexing-lifecycle-3] [63528f5c2c4f78275d37902d-f5-u6-a0 ReplicationIndexManager] Transitioning from INITIAL_SYNC to INITIAL_SYNC_BACKOFF.2023-08-16T16:43:18.068+0000 I MONGOT [config-monitor] [c.x.m.config.provider.mms.ConfCaller] Conf call response has not changed. 
Last update date: 2023-08-16T16:43:18Z.2023-08-16T16:43:36.545+0000 I MONGOT [indexing-lifecycle-2] [63528f5c2c4f78275d37902d-f5-u6-a0 ReplicationIndexManager] Transitioning from INITIAL_SYNC_BACKOFF to INITIAL_SYNC.