Spaces:

hfmrbean
/

Barbie-Oppenheimer-LlamaIndex-RAQA-Tool

Runtime error

App Files Files Community

raul-padua committed on Aug 27, 2023

Commit

a2c1b0b

1 Parent(s): 9227993

Update app.py

Browse files

Files changed (1) hide show

app.py +154 -99

app.py CHANGED Viewed

@@ -1,110 +1,165 @@
-import chainlit as cl
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.document_loaders.csv_loader import CSVLoader
-from langchain.embeddings import CacheBackedEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import FAISS
-from langchain.chains import RetrievalQA
-from langchain.chat_models import ChatOpenAI
-from langchain.storage import LocalFileStore
-from langchain.prompts.chat import (
-    ChatPromptTemplate,
-    SystemMessagePromptTemplate,
-    HumanMessagePromptTemplate,
 )
-import chainlit as cl
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-system_template = """
-Use the following pieces of context to answer the user's question.
-Please respond as if you were Ken from the movie Barbie. Ken is a well-meaning but naive character who loves to Beach. He talks like a typical Californian Beach Bro, but he doesn't use the word "Dude" so much.
-If you don't know the answer, just say that you don't know, don't try to make up an answer.
-You can make inferences based on the context as long as it still faithfully represents the feedback.
-Example of your response should be:
-```
-The answer is foo
-```
-Begin!
-----------------
-{context}"""
-messages = [
-    SystemMessagePromptTemplate.from_template(system_template),
-    HumanMessagePromptTemplate.from_template("{question}"),
-]
-prompt = ChatPromptTemplate(messages=messages)
-chain_type_kwargs = {"prompt": prompt}
-@cl.author_rename
-def rename(orig_author: str):
-    rename_dict = {"RetrievalQA": "Consulting The Kens"}
-    return rename_dict.get(orig_author, orig_author)
-@cl.on_chat_start
-async def init():
-    msg = cl.Message(content=f"Building Index...")
-    await msg.send()
-    # build FAISS index from csv
-    loader = CSVLoader(file_path="./data/barbie.csv", source_column="Review_Url")
-    data = loader.load()
-    documents = text_splitter.transform_documents(data)
-    store = LocalFileStore("./cache/")
-    core_embeddings_model = OpenAIEmbeddings()
-    embedder = CacheBackedEmbeddings.from_bytes_store(
-        core_embeddings_model, store, namespace=core_embeddings_model.model
     )
-    # make async docsearch
-    docsearch = await cl.make_async(FAISS.from_documents)(documents, embedder)
-    chain = RetrievalQA.from_chain_type(
-        ChatOpenAI(model="gpt-4", temperature=0, streaming=True),
-        chain_type="stuff",
-        return_source_documents=True,
-        retriever=docsearch.as_retriever(),
-        chain_type_kwargs = {"prompt": prompt}
     )
-    msg.content = f"Index built!"
-    await msg.send()
-    cl.user_session.set("chain", chain)
-@cl.on_message
-async def main(message):
-    chain = cl.user_session.get("chain")
-    cb = cl.AsyncLangchainCallbackHandler(
-        stream_final_answer=False, answer_prefix_tokens=["FINAL", "ANSWER"]
     )
-    cb.answer_reached = True
-    res = await chain.acall(message, callbacks=[cb], )
-    answer = res["result"]
-    source_elements = []
-    visited_sources = set()
-    # Get the documents from the user session
-    docs = res["source_documents"]
-    metadatas = [doc.metadata for doc in docs]
-    all_sources = [m["source"] for m in metadatas]
-    for source in all_sources:
-        if source in visited_sources:
-            continue
-        visited_sources.add(source)
-        # Create the text element referenced in the message
-        source_elements.append(
-            cl.Text(content="https://www.imdb.com" + source, name="Review URL")
-        )
-    if source_elements:
-        answer += f"\nSources: {', '.join([e.content.decode('utf-8') for e in source_elements])}"
     else:
-        answer += "\nNo sources found"
-    await cl.Message(content=answer, elements=source_elements).send()

+from llama_index import ServiceContext, SimpleNodeParser, TokenTextSplitter, OpenAI, OpenAIEmbedding
+from llama_index.vector_stores import ChromaVectorStore
+from llama_index.storage.storage_context import StorageContext
+from llama_index import VectorStoreIndex, WikipediaReader
+from llama_index.tools import FunctionTool
+from llama_index.vector_stores.types import VectorStoreInfo, MetadataInfo, ExactMatchFilter, MetadataFilters
+from llama_index.retrievers import VectorIndexRetriever
+from llama_index.query_engine import RetrieverQueryEngine
+from typing import List, Tuple, Any
+from pydantic import BaseModel, Field
+import chromadb
+import pandas as pd
+from sqlalchemy import create_engine
+from llama_index import SQLDatabase, NLSQLTableQueryEngine, QueryEngineTool
+from llama_index.openai_agent import OpenAIAgent
+from chainlit import ChainLit
+# Embedding Model and Low-level model
+embed_model = OpenAIEmbedding()
+chunk_size = 1000
+chunk_overlap = 100
+llm = OpenAI(
+    temperature=0,
+    model="gpt-4-32k",
+    streaming=True
 )
+service_context = ServiceContext.from_defaults(
+    llm=llm,
+    chunk_size=chunk_size,
+    embed_model=embed_model
+)
+text_splitter = TokenTextSplitter(
+    chunk_size=chunk_size,
+    chunk_overlap=chunk_overlap
+)
+node_parser = SimpleNodeParser(text_splitter=text_splitter)
+# Vector Storage and Context
+chroma_client = chromadb.Client()
+chroma_collection = chroma_client.create_collection("wikipedia_barbie_opp")
+vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+# Your Wikipedia docs retrieval
+movie_list = ["Barbie (film)", "Oppenheimer (film)"]
+wiki_docs = WikipediaReader().load_data(pages=movie_list, auto_suggest=False)
+# Parsing and storing vectors
+wiki_vector_index = VectorStoreIndex([], storage_context=storage_context, service_context=service_context)
+for movie, wiki_doc in zip(movie_list, wiki_docs):
+    nodes = node_parser.get_nodes_from_documents([wiki_doc])
+    for node in nodes:
+        node.metadata = {"title": movie}
+    wiki_vector_index.insert_nodes(nodes)
+# Defining the tools for vector search and SQL query
+top_k = 3
+vector_store_info = VectorStoreInfo(
+    content_info="semantic information about movies",
+    metadata_info=[MetadataInfo(
+        name="title",
+        type="str",
+        description="title of the movie, one of [Barbie (film), Oppenheimer (film)]",
+    )]
+)
+# Create PyDantic model for auto retrieval
+class AutoRetrieveModel(BaseModel):
+    query: str = Field(..., description="natural language query string")
+    filter_key_list: List[str] = Field(
+        ..., description="List of metadata filter field names"
     )
+    filter_value_list: List[str] = Field(
+        ...,
+        description=(
+            "List of metadata filter field values (corresponding to names specified in filter_key_list)"
+        )
     )
+def auto_retrieve_fn(query: str, filter_key_list: List[str], filter_value_list: List[str]):
+    exact_match_filters = [
+        ExactMatchFilter(key=k, value=v)
+        for k, v in zip(filter_key_list, filter_value_list)
+    ]
+    retriever = VectorIndexRetriever(
+        wiki_vector_index, filters=MetadataFilters(filters=exact_match_filters), top_k=top_k
+    )
+    query_engine = RetrieverQueryEngine.from_args(retriever)
+    response = query_engine.query(query)
+    return str(response)
+description = f"""\
+Use this tool to look up semantic information about films.
+The vector database schema is given below:
+{vector_store_info.json()}
+"""
+auto_retrieve_tool = FunctionTool.from_defaults(
+    fn=auto_retrieve_fn,
+    name="Auto_Retriever",
+    description=description,
+    fn_schema=AutoRetrieveModel
+)
+# SQL setup and tool definition
+barbie_df = pd.read_csv("barbie_data/barbie.csv")
+oppenheimer_df = pd.read_csv("oppenheimer_data/oppenheimer.csv")
+engine = create_engine("sqlite+pysqlite:///:memory:")
+barbie_df.to_sql(name='barbie', con=engine)
+oppenheimer_df.to_sql(name='oppenheimer', con=engine)
+sql_database = SQLDatabase(
+    engine=engine,
+    include_tables=['barbie', 'oppenheimer']
+)
+sql_query_engine = NLSQLTableQueryEngine(
+    sql_database=sql_database,
+    tables=['barbie', 'oppenheimer']
+)
+sql_tool = QueryEngineTool.from_defaults(
+    query_engine=sql_query_engine,
+    name="Natural_Language_to_SQL_Tool",
+    description=(
+        "Useful for translating a natural language query into a SQL query."
     )
+)
+# Combining both tools into a single OpenAI Agent
+barbenheimer_agent = OpenAIAgent.from_tools(
+    tools=[auto_retrieve_tool, sql_tool]
+)
+# Initialize the ChainLit app
+cl = ChainLit()
+# On-Message Function
+@cl.on_message
+def handle_message(message: str, sender: str) -> Tuple[str, Any]:
+    query_result = barbenheimer_agent.query(
+        query=message,
+        user_id=sender
+    )
+    # Extracting relevant information from the query result
+    tool_name = query_result.tool_name
+    response = query_result.response
+    if tool_name == "Auto_Retriever":
+        # Processing for semantic information retrieval
+        return "Auto_Retriever", f"Semantic Information:\n{response}"
+    elif tool_name == "Natural_Language_to_SQL_Tool":
+        # Processing for SQL-based information
+        return "Natural_Language_to_SQL_Tool", f"SQL Query Result:\n{response}"
     else:
+        # Handling unrecognized tool queries
+        return "Unknown", "I couldn't understand your request."
+# Running the app
+if __name__ == '__main__':
+    cl.run()