Spaces:

ishandutta
/

multimodal-myntra-shoes-rag-pipeline

Runtime error

App Files Files Community

ishandutta commited on Jun 20

Commit

b02d315

verified ·

1 Parent(s): ee0ed55

Create rag_pipeline.py

Browse files

Files changed (1) hide show

rag_pipeline.py +545 -0

rag_pipeline.py ADDED Viewed

	@@ -0,0 +1,545 @@

+"""
+SCRIPT 4/5: rag_pipeline.py - Complete RAG Pipeline Integration for Shoe Search
+Colab - https://colab.research.google.com/drive/1rq-ywjykHBw7xPXCmd3DmZdK6T9bhDtA?usp=sharing
+This script integrates all three phases of the RAG pipeline:
+1. RETRIEVAL: Vector search and data management (from retriever.py)
+2. AUGMENTATION: Context enhancement and prompt engineering (from augmenter.py)
+3. GENERATION: LLM setup and response generation (from generator.py)
+Key Concepts:
+- RAG (Retrieval-Augmented Generation): A technique that combines information retrieval
+  with language generation to provide accurate, contextual responses
+- Pipeline Integration: Connecting multiple AI components in sequence
+- End-to-End Processing: Complete workflow from query to final response
+- Multi-modal Search: Supporting both text and image queries
+Required Dependencies:
+- All dependencies from retriever.py, augmenter.py, and generator.py
+Commands to run:
+# Complete RAG pipeline with text query
+python rag_pipeline.py --query "recommend running shoes for men"
+# Complete RAG pipeline with image query
+python rag_pipeline.py --query "hf_shoe_images/shoe_0000.jpg"
+# RAG pipeline with OpenAI model (Requires API key)
+python rag_pipeline.py --query "comfortable sneakers" --model-provider openai --openai-api-key YOUR_KEY
+# RAG pipeline with detailed step tracking
+python rag_pipeline.py --query "blue shoes" --detailed-steps
+# Setup database and run pipeline
+python rag_pipeline.py --setup-db --query "recommend me men's casual shoes"
+# Pipeline without LLM (retrieval only)
+python rag_pipeline.py --query "recommend me men's running shoes" --no-llm
+"""
+import argparse
+from typing import Any, Dict, List, Optional
+from openai import OpenAI
+from PIL import Image
+from augmenter import QueryType, SimpleShoePrompts
+from generator import (
+    generate_shoes_rag_response,
+    get_available_models,
+    setup_openai_client,
+    setup_qwen_model,
+)
+# Import components from other modules
+from retriever import MyntraShoesEnhanced, create_shoes_table_from_hf, run_shoes_search
+def run_complete_shoes_rag_pipeline(
+    database: str,
+    table_name: str,
+    schema: Any,
+    search_query: Any,  # Can be text string or image path/PIL Image
+    limit: int = 3,
+    use_llm: bool = True,
+    use_advanced_prompts: bool = True,
+    search_type: str = "auto",
+    model_provider: str = "qwen",
+    model_name: str = "Qwen/Qwen2.5-0.5B-Instruct",
+    openai_api_key: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Run complete RAG pipeline integrating Retrieval, Augmentation, and Generation."""
+    # SECTION 1: RETRIEVAL - Get relevant shoes from vector database
+    print("🔍 RETRIEVAL: Searching for relevant shoes...")
+    results, actual_search_type = run_shoes_search(
+        database, table_name, schema, search_query, limit, search_type=search_type
+    )
+    if not results:
+        return {
+            "query": search_query,
+            "results": [],
+            "response": "No results found",
+            "search_type": actual_search_type,
+        }
+    if not use_llm:
+        return {
+            "query": search_query,
+            "results": results,
+            "response": None,
+            "search_type": actual_search_type,
+        }
+    # SECTION 2: AUGMENTATION - Process and enhance context with prompt engineering
+    try:
+        print("📝 AUGMENTATION: Enhancing context with prompt engineering...")
+        # Set up prompt manager and analyze query
+        prompt_manager = SimpleShoePrompts()
+        # For image search, use appropriate query text
+        if actual_search_type == "image":
+            query_text = "similar shoes based on the provided image"
+            print(f"   └─ Image search - using search query type")
+        else:
+            query_text = str(search_query)
+            query_type = prompt_manager.classify_query(query_text)
+            print(f"   └─ Text query classified as: {query_type.value}")
+        # Format context and generate enhanced prompt
+        enhanced_prompt = prompt_manager.generate_prompt(
+            query_text, results, actual_search_type
+        )
+        print(f"   └─ Context formatted with {len(results)} retrieved shoes")
+        # SECTION 3: GENERATION - Setup LLM and generate response
+        print("🤖 GENERATION: Setting up LLM and generating response...")
+        tokenizer, model, openai_client = None, None, None
+        if model_provider == "openai":
+            if not openai_api_key:
+                raise ValueError("OpenAI API key is required for OpenAI models")
+            openai_client = setup_openai_client(openai_api_key)
+            print(f"   └─ OpenAI client setup with model: {model_name}")
+        else:
+            tokenizer, model = setup_qwen_model(model_name)
+            print(f"   └─ Qwen model loaded: {model_name}")
+        # Generate final response using augmented context
+        response = generate_shoes_rag_response(
+            query=query_text,
+            retrieved_shoes=results,
+            model_provider=model_provider,
+            model_name=model_name,
+            openai_client=openai_client,
+            tokenizer=tokenizer,
+            model=model,
+            max_tokens=200,
+            use_advanced_prompts=use_advanced_prompts,
+        )
+        # Add prompt analysis
+        if actual_search_type == "image":
+            final_query_type = QueryType.SEARCH.value
+        else:
+            final_query_type = query_type.value
+        prompt_analysis = {
+            "query_type": final_query_type,
+            "num_results": len(results),
+            "search_type": actual_search_type,
+        }
+        return {
+            "query": search_query,
+            "results": results,
+            "response": response,
+            "prompt_analysis": prompt_analysis,
+            "search_type": actual_search_type,
+        }
+    except Exception as e:
+        print(f"LLM generation failed: {e}")
+        return {
+            "query": search_query,
+            "results": results,
+            "response": "LLM unavailable - showing search results only",
+            "search_type": actual_search_type,
+        }
+def run_complete_shoes_rag_pipeline_with_details(
+    database: str,
+    table_name: str,
+    schema: Any,
+    search_query: Any,  # Can be text string or image path/PIL Image
+    limit: int = 3,
+    use_llm: bool = True,
+    use_advanced_prompts: bool = True,
+    search_type: str = "auto",
+    model_provider: str = "qwen",
+    model_name: str = "Qwen/Qwen2.5-0.5B-Instruct",
+    openai_api_key: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Run complete RAG pipeline with detailed step tracking."""
+    # Initialize step details
+    retrieval_details = ""
+    augmentation_details = ""
+    generation_details = ""
+    # SECTION 1: RETRIEVAL - Get relevant shoes from vector database
+    retrieval_details += "🔍 RETRIEVAL PHASE\n"
+    retrieval_details += "=" * 50 + "\n"
+    retrieval_details += f"🎯 Query Type: {search_type}\n"
+    retrieval_details += f"🔍 Searching vector database...\n"
+    results, actual_search_type = run_shoes_search(
+        database, table_name, schema, search_query, limit, search_type=search_type
+    )
+    retrieval_details += f"✅ Search completed!\n"
+    retrieval_details += f"📊 Search Type Detected: {actual_search_type}\n"
+    retrieval_details += f"📈 Results Found: {len(results)}\n\n"
+    if results:
+        retrieval_details += "🎯 Retrieved Products:\n"
+        for i, result in enumerate(results, 1):
+            retrieval_details += f"  {i}. {result.get('product_type', 'Shoe')} for {result.get('gender', 'Unisex')}\n"
+            retrieval_details += f"     Color: {result.get('color', 'N/A')}\n"
+            retrieval_details += f"     Pattern: {result.get('pattern', 'N/A')}\n"
+            if result.get("description"):
+                # Show full description without truncation
+                retrieval_details += f"     Description: {result['description']}\n"
+            retrieval_details += "\n"
+    else:
+        retrieval_details += "❌ No results found\n"
+        return {
+            "query": search_query,
+            "results": [],
+            "response": "No results found",
+            "search_type": actual_search_type,
+            "retrieval_details": retrieval_details,
+            "augmentation_details": "⏭️ Skipped - No results to process",
+            "generation_details": "⏭️ Skipped - No results to process",
+        }
+    if not use_llm:
+        return {
+            "query": search_query,
+            "results": results,
+            "response": None,
+            "search_type": actual_search_type,
+            "retrieval_details": retrieval_details,
+            "augmentation_details": "⏭️ Skipped - LLM disabled",
+            "generation_details": "⏭️ Skipped - LLM disabled",
+        }
+    # SECTION 2: AUGMENTATION - Process and enhance context with prompt engineering
+    try:
+        augmentation_details += "📝 AUGMENTATION PHASE\n"
+        augmentation_details += "=" * 50 + "\n"
+        # Set up prompt manager and analyze query
+        prompt_manager = SimpleShoePrompts()
+        # For image search, use appropriate query text
+        if actual_search_type == "image":
+            query_text = "similar shoes based on the provided image"
+            augmentation_details += f"🖼️ Image Search Detected\n"
+            augmentation_details += f"🔄 Query Text: '{query_text}'\n"
+        else:
+            query_text = str(search_query)
+            query_type = prompt_manager.classify_query(query_text)
+            augmentation_details += f"📝 Text Query: '{query_text}'\n"
+            augmentation_details += f"🎯 Query Classification: {query_type.value}\n"
+        # Format context and generate enhanced prompt
+        enhanced_prompt = prompt_manager.generate_prompt(
+            query_text, results, actual_search_type
+        )
+        augmentation_details += f"📊 Context Processing:\n"
+        augmentation_details += f"  • Products formatted: {len(results)}\n"
+        augmentation_details += (
+            f"  • Prompt strategy: {'Advanced' if use_advanced_prompts else 'Basic'}\n"
+        )
+        augmentation_details += (
+            f"  • Prompt length: {len(enhanced_prompt)} characters\n\n"
+        )
+        # Show the full prompt instead of preview
+        augmentation_details += f"🔍 Full Prompt:\n{enhanced_prompt}\n\n"
+        # SECTION 3: GENERATION - Setup LLM and generate response
+        generation_details += "🤖 GENERATION PHASE\n"
+        generation_details += "=" * 50 + "\n"
+        generation_details += f"🏭 Model Provider: {model_provider}\n"
+        generation_details += f"🎯 Model Name: {model_name}\n"
+        tokenizer, model, openai_client = None, None, None
+        if model_provider == "openai":
+            if not openai_api_key:
+                raise ValueError("OpenAI API key is required for OpenAI models")
+            openai_client = setup_openai_client(openai_api_key)
+            generation_details += f"✅ OpenAI client initialized\n"
+            generation_details += f"🔑 API key: {'*' * (len(openai_api_key) - 8) + openai_api_key[-4:] if len(openai_api_key) > 8 else '****'}\n"
+        else:
+            tokenizer, model = setup_qwen_model(model_name)
+            generation_details += f"✅ Qwen model loaded\n"
+            generation_details += f"💾 Model size: {model_name}\n"
+        generation_details += f"⚙️ Generation settings:\n"
+        generation_details += f"  • Max tokens: 200\n"
+        generation_details += f"  • Temperature: 0.1 (low for consistency)\n"
+        generation_details += f"  • Advanced prompts: {use_advanced_prompts}\n\n"
+        generation_details += f"🔄 Generating response...\n"
+        # Generate final response using augmented context
+        response = generate_shoes_rag_response(
+            query=query_text,
+            retrieved_shoes=results,
+            model_provider=model_provider,
+            model_name=model_name,
+            openai_client=openai_client,
+            tokenizer=tokenizer,
+            model=model,
+            max_tokens=200,
+            use_advanced_prompts=use_advanced_prompts,
+        )
+        generation_details += f"✅ Response generated!\n"
+        generation_details += f"📏 Response length: {len(response)} characters\n"
+        generation_details += f"📝 Full Response:\n{response}\n"
+        # Add prompt analysis
+        if actual_search_type == "image":
+            final_query_type = QueryType.SEARCH.value
+        else:
+            final_query_type = query_type.value
+        prompt_analysis = {
+            "query_type": final_query_type,
+            "num_results": len(results),
+            "search_type": actual_search_type,
+        }
+        return {
+            "query": search_query,
+            "results": results,
+            "response": response,
+            "prompt_analysis": prompt_analysis,
+            "search_type": actual_search_type,
+            "retrieval_details": retrieval_details,
+            "augmentation_details": augmentation_details,
+            "generation_details": generation_details,
+        }
+    except Exception as e:
+        error_msg = f"❌ LLM generation failed: {str(e)}"
+        generation_details += error_msg
+        return {
+            "query": search_query,
+            "results": results,
+            "response": "LLM unavailable - showing search results only",
+            "search_type": actual_search_type,
+            "retrieval_details": retrieval_details,
+            "augmentation_details": augmentation_details,
+            "generation_details": generation_details,
+        }
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Complete RAG Pipeline for Shoe Search"
+    )
+    parser.add_argument(
+        "--query", type=str, help="Search query (text) or image file path"
+    )
+    parser.add_argument(
+        "--search-type",
+        choices=["auto", "text", "image"],
+        default="auto",
+        help="Force search type (default: auto-detect)",
+    )
+    parser.add_argument(
+        "--limit", type=int, default=3, help="Number of results to retrieve"
+    )
+    parser.add_argument(
+        "--database", type=str, default="myntra_shoes_db", help="Database path"
+    )
+    parser.add_argument(
+        "--table-name", type=str, default="myntra_shoes_table", help="Table name"
+    )
+    # Model configuration
+    parser.add_argument(
+        "--model-provider",
+        choices=["qwen", "openai"],
+        default="qwen",
+        help="Model provider to use",
+    )
+    parser.add_argument("--model-name", type=str, help="Model name to use")
+    parser.add_argument(
+        "--openai-api-key", type=str, help="OpenAI API key (required for OpenAI models)"
+    )
+    parser.add_argument(
+        "--use-advanced-prompts",
+        action="store_true",
+        default=True,
+        help="Use advanced prompt engineering",
+    )
+    parser.add_argument(
+        "--basic-prompts",
+        action="store_true",
+        help="Use basic prompts instead of advanced",
+    )
+    # Pipeline options
+    parser.add_argument(
+        "--no-llm", action="store_true", help="Run retrieval only, skip LLM generation"
+    )
+    parser.add_argument(
+        "--detailed-steps",
+        action="store_true",
+        help="Show detailed step-by-step breakdown",
+    )
+    parser.add_argument(
+        "--setup-db",
+        action="store_true",
+        help="Setup database from HuggingFace dataset",
+    )
+    parser.add_argument(
+        "--sample-size", type=int, default=500, help="Sample size for database setup"
+    )
+    args = parser.parse_args()
+    # Setup database if requested
+    if args.setup_db:
+        print("🔄 Setting up database from HuggingFace dataset...")
+        create_shoes_table_from_hf(
+            database=args.database,
+            table_name=args.table_name,
+            sample_size=args.sample_size,
+            save_images=True,
+        )
+        print("✅ Database setup complete!")
+        if not args.query:
+            exit(0)
+    # Validate query
+    if not args.query:
+        print("❌ Please provide a query using --query")
+        print("\nExample usage:")
+        print("  # Setup database first")
+        print("  python rag_pipeline.py --setup-db")
+        print("  # Complete RAG pipeline with text query")
+        print("  python rag_pipeline.py --query 'recommend running shoes for men'")
+        print("  # RAG pipeline with image query")
+        print("  python rag_pipeline.py --query 'path/to/shoe.jpg' --search-type image")
+        print("  # RAG pipeline with OpenAI")
+        print(
+            "  python rag_pipeline.py --query 'comfortable sneakers' --model-provider openai --openai-api-key YOUR_KEY"
+        )
+        print("  # Detailed step tracking")
+        print("  python rag_pipeline.py --query 'blue shoes' --detailed-steps")
+        exit(1)
+    # Set default model names based on provider
+    available_models = get_available_models()
+    if not args.model_name:
+        args.model_name = available_models[args.model_provider][0]
+    # Handle basic prompts flag
+    use_advanced_prompts = (
+        not args.basic_prompts if args.basic_prompts else args.use_advanced_prompts
+    )
+    # Validate OpenAI setup
+    if args.model_provider == "openai":
+        if not args.openai_api_key:
+            print(
+                "❌ OpenAI API key is required for OpenAI models. Use --openai-api-key"
+            )
+            exit(1)
+    print("🚀 Starting Complete RAG Pipeline...")
+    print("=" * 60)
+    print(f"Query: {args.query}")
+    print(f"Search Type: {args.search_type}")
+    print(f"Model Provider: {args.model_provider}")
+    print(f"Model Name: {args.model_name}")
+    print(f"Use LLM: {not args.no_llm}")
+    print(f"Advanced Prompts: {use_advanced_prompts}")
+    # Run pipeline
+    if args.detailed_steps:
+        rag_result = run_complete_shoes_rag_pipeline_with_details(
+            database=args.database,
+            table_name=args.table_name,
+            schema=MyntraShoesEnhanced,
+            search_query=args.query,
+            limit=args.limit,
+            use_llm=not args.no_llm,
+            use_advanced_prompts=use_advanced_prompts,
+            search_type=args.search_type,
+            model_provider=args.model_provider,
+            model_name=args.model_name,
+            openai_api_key=args.openai_api_key,
+        )
+        # Display detailed results
+        print("\n" + "=" * 60)
+        print("📊 RAG PIPELINE DETAILED RESULTS")
+        print("=" * 60)
+        print("\n" + rag_result.get("retrieval_details", "No retrieval details"))
+        print("\n" + rag_result.get("augmentation_details", "No augmentation details"))
+        print("\n" + rag_result.get("generation_details", "No generation details"))
+    else:
+        rag_result = run_complete_shoes_rag_pipeline(
+            database=args.database,
+            table_name=args.table_name,
+            schema=MyntraShoesEnhanced,
+            search_query=args.query,
+            limit=args.limit,
+            use_llm=not args.no_llm,
+            use_advanced_prompts=use_advanced_prompts,
+            search_type=args.search_type,
+            model_provider=args.model_provider,
+            model_name=args.model_name,
+            openai_api_key=args.openai_api_key,
+        )
+    # Display results
+    print("\n" + "=" * 60)
+    print("📊 RAG PIPELINE RESULTS")
+    print("=" * 60)
+    print(f"Query: {rag_result['query']}")
+    print(f"Search Type: {rag_result['search_type']}")
+    if rag_result.get("prompt_analysis"):
+        print(f"Query Type: {rag_result['prompt_analysis']['query_type']}")
+        print(f"Results Found: {rag_result['prompt_analysis']['num_results']}")
+    if rag_result.get("response"):
+        print(f"\n💬 RAG Response:")
+        print(rag_result["response"])
+    print(f"\n👟 Retrieved Shoes:")
+    for result in rag_result["results"]:
+        print(
+            f"- {result['product_type']} ({result['gender']}) - {result['color']} - {result['pattern']}"
+        )
+        if rag_result["search_type"] == "image":
+            print(f"  📁 Image saved: {result['image_path']}")
+    if rag_result["search_type"] == "image":
+        print(f"\n🖼️  Search results images saved in: shoe_search_output/")
+    print("\n✅ RAG Pipeline Complete!")