Shreyas94 committed on
Commit
8675311
·
verified ·
1 Parent(s): cf5225f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -17
app.py CHANGED
@@ -594,16 +594,16 @@ class LLMSummarizer:
594
  return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
595
 
596
  CRITICAL INSTRUCTIONS:
597
- 1. ONLY use information that is directly relevant to the user's query
598
- 2. If the search results don't contain relevant information, explicitly state this
599
- 3. Don't make up information or provide generic advice
600
- 4. Synthesize information from multiple sources when available
601
  5. Include specific facts, dates, numbers, and quotes when present
602
  6. If information is contradictory between sources, mention this
603
  7. Cite sources by mentioning the publication or website name
604
- 8. Be specific and detailed rather than vague
605
 
606
- If the search results are not relevant to the query, respond with: "The search results do not contain sufficient relevant information to answer your query about [topic]. The results primarily contained [brief description of what was actually found]."
607
 
608
  Format your response as a comprehensive summary, not bullet points."""
609
 
@@ -612,10 +612,35 @@ Format your response as a comprehensive summary, not bullet points."""
612
  valid_results = []
613
  validation_info = []
614
 
615
- query_keywords = set(query.lower().split())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
616
 
617
  for result in search_results:
618
- if not result.content or len(result.content.strip()) < 100:
619
  validation_info.append(f"Skipped '{result.title}' - insufficient content")
620
  continue
621
 
@@ -623,23 +648,45 @@ Format your response as a comprehensive summary, not bullet points."""
623
  content_lower = result.content.lower()
624
  title_lower = result.title.lower()
625
  snippet_lower = result.snippet.lower()
 
626
 
627
- # Count relevant keywords
628
  relevant_score = 0
629
- for keyword in query_keywords:
630
- if len(keyword) > 2: # Skip very short words
 
 
631
  if keyword in content_lower:
632
  relevant_score += 2
 
633
  elif keyword in title_lower:
634
- relevant_score += 1
 
635
  elif keyword in snippet_lower:
636
- relevant_score += 0.5
 
637
 
638
- if relevant_score > 0:
 
 
 
 
 
 
 
639
  valid_results.append(result)
640
- validation_info.append(f"✓ '{result.title}' - relevance score: {relevant_score}")
641
  else:
642
- validation_info.append(f"Skipped '{result.title}' - not relevant to query")
 
 
 
 
 
 
 
 
 
643
 
644
  validation_summary = "\n".join(validation_info)
645
  return valid_results, validation_summary
@@ -655,7 +702,7 @@ Format your response as a comprehensive summary, not bullet points."""
655
  VALIDATION RESULTS:
656
  {validation_info}
657
 
658
- No search results contained relevant content for this query. Please provide a response indicating that insufficient relevant information was found."""
659
 
660
  content_parts = [f'User Query: "{query}"\n']
661
  content_parts.append(f"Number of relevant sources found: {len(valid_results)}\n")
 
594
  return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
595
 
596
  CRITICAL INSTRUCTIONS:
597
+ 1. Analyze ALL provided content carefully - even if it seems only tangentially related
598
+ 2. Look for connections between the query and the content, even if not immediately obvious
599
+ 3. If content is about a parent company/organization mentioned in the query, include relevant information
600
+ 4. Extract and synthesize any information that could be relevant to answering the user's question
601
  5. Include specific facts, dates, numbers, and quotes when present
602
  6. If information is contradictory between sources, mention this
603
  7. Cite sources by mentioning the publication or website name
604
+ 8. Be thorough and detailed rather than dismissive
605
 
606
+ ONLY state that results are not relevant if they are completely unrelated to any aspect of the query. If there is ANY connection (like parent company info, related business segments, etc.), include that information.
607
 
608
  Format your response as a comprehensive summary, not bullet points."""
609
 
 
612
  valid_results = []
613
  validation_info = []
614
 
615
+ # More intelligent keyword extraction
616
+ query_lower = query.lower()
617
+
618
+ # Extract key entities and terms
619
+ important_keywords = []
620
+
621
+ # Split query into words and extract meaningful terms
622
+ words = query_lower.split()
623
+ for word in words:
624
+ if len(word) > 2 and word not in ['news', 'latest', 'recent', 'update', 'information', 'about']:
625
+ important_keywords.append(word)
626
+
627
+ # Also look for multi-word entities (like company names)
628
+ # Extract potential company/entity names from query
629
+ entity_patterns = [
630
+ r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', # Proper names
631
+ r'\b[A-Z]{2,}(?:\s+[A-Z][a-z]+)*\b', # Acronyms
632
+ ]
633
+
634
+ for pattern in entity_patterns:
635
+ matches = re.findall(pattern, query)
636
+ for match in matches:
637
+ important_keywords.extend(match.lower().split())
638
+
639
+ # Remove duplicates
640
+ important_keywords = list(set(important_keywords))
641
 
642
  for result in search_results:
643
+ if not result.content or len(result.content.strip()) < 50: # Lowered threshold
644
  validation_info.append(f"Skipped '{result.title}' - insufficient content")
645
  continue
646
 
 
648
  content_lower = result.content.lower()
649
  title_lower = result.title.lower()
650
  snippet_lower = result.snippet.lower()
651
+ combined_text = f"{title_lower} {snippet_lower} {content_lower}"
652
 
653
+ # More flexible relevance scoring
654
  relevant_score = 0
655
+ matched_keywords = []
656
+
657
+ for keyword in important_keywords:
658
+ if keyword in combined_text:
659
  if keyword in content_lower:
660
  relevant_score += 2
661
+ matched_keywords.append(keyword)
662
  elif keyword in title_lower:
663
+ relevant_score += 3 # Title matches are very important
664
+ matched_keywords.append(keyword)
665
  elif keyword in snippet_lower:
666
+ relevant_score += 1
667
+ matched_keywords.append(keyword)
668
 
669
+ # Special handling for acronyms and company names
670
+ # If query contains a company acronym (like KKR), be more lenient
671
+ has_company_match = any(len(kw) <= 4 and kw.isupper() for kw in query.split())
672
+ if has_company_match:
673
+ relevant_score += 1 # Boost score for company-related queries
674
+
675
+ # Lower the threshold and accept more results
676
+ if relevant_score >= 1 or len(matched_keywords) >= 1:
677
  valid_results.append(result)
678
+ validation_info.append(f"✓ '{result.title}' - score: {relevant_score}, matched: {matched_keywords}")
679
  else:
680
+ validation_info.append(f"Skipped '{result.title}' - no relevant keywords found")
681
+
682
+ # If we filtered out too many results, be more lenient
683
+ if len(valid_results) < len(search_results) * 0.3: # If we filtered out more than 70%
684
+ validation_info.append("⚠️ Too many results filtered, being more lenient...")
685
+ # Add back results that have any content
686
+ for result in search_results:
687
+ if result not in valid_results and result.content.strip():
688
+ valid_results.append(result)
689
+ validation_info.append(f"✓ '{result.title}' - added back (lenient mode)")
690
 
691
  validation_summary = "\n".join(validation_info)
692
  return valid_results, validation_summary
 
702
  VALIDATION RESULTS:
703
  {validation_info}
704
 
705
+ The search results did not pass the initial relevance filter, but this might be overly restrictive. Please analyze the raw content provided and extract any information that could be relevant to answering the user's query, even if the connection is not immediately obvious."""
706
 
707
  content_parts = [f'User Query: "{query}"\n']
708
  content_parts.append(f"Number of relevant sources found: {len(valid_results)}\n")