Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -594,16 +594,16 @@ class LLMSummarizer:
|
|
| 594 |
return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
|
| 595 |
|
| 596 |
CRITICAL INSTRUCTIONS:
|
| 597 |
-
1.
|
| 598 |
-
2.
|
| 599 |
-
3.
|
| 600 |
-
4.
|
| 601 |
5. Include specific facts, dates, numbers, and quotes when present
|
| 602 |
6. If information is contradictory between sources, mention this
|
| 603 |
7. Cite sources by mentioning the publication or website name
|
| 604 |
-
8. Be
|
| 605 |
|
| 606 |
-
|
| 607 |
|
| 608 |
Format your response as a comprehensive summary, not bullet points."""
|
| 609 |
|
|
@@ -612,10 +612,35 @@ Format your response as a comprehensive summary, not bullet points."""
|
|
| 612 |
valid_results = []
|
| 613 |
validation_info = []
|
| 614 |
|
| 615 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
|
| 617 |
for result in search_results:
|
| 618 |
-
if not result.content or len(result.content.strip()) <
|
| 619 |
validation_info.append(f"Skipped '{result.title}' - insufficient content")
|
| 620 |
continue
|
| 621 |
|
|
@@ -623,23 +648,45 @@ Format your response as a comprehensive summary, not bullet points."""
|
|
| 623 |
content_lower = result.content.lower()
|
| 624 |
title_lower = result.title.lower()
|
| 625 |
snippet_lower = result.snippet.lower()
|
|
|
|
| 626 |
|
| 627 |
-
#
|
| 628 |
relevant_score = 0
|
| 629 |
-
|
| 630 |
-
|
|
|
|
|
|
|
| 631 |
if keyword in content_lower:
|
| 632 |
relevant_score += 2
|
|
|
|
| 633 |
elif keyword in title_lower:
|
| 634 |
-
relevant_score +=
|
|
|
|
| 635 |
elif keyword in snippet_lower:
|
| 636 |
-
relevant_score +=
|
|
|
|
| 637 |
|
| 638 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
valid_results.append(result)
|
| 640 |
-
validation_info.append(f"✓ '{result.title}' -
|
| 641 |
else:
|
| 642 |
-
validation_info.append(f"Skipped '{result.title}' -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
validation_summary = "\n".join(validation_info)
|
| 645 |
return valid_results, validation_summary
|
|
@@ -655,7 +702,7 @@ Format your response as a comprehensive summary, not bullet points."""
|
|
| 655 |
VALIDATION RESULTS:
|
| 656 |
{validation_info}
|
| 657 |
|
| 658 |
-
|
| 659 |
|
| 660 |
content_parts = [f'User Query: "{query}"\n']
|
| 661 |
content_parts.append(f"Number of relevant sources found: {len(valid_results)}\n")
|
|
|
|
| 594 |
return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
|
| 595 |
|
| 596 |
CRITICAL INSTRUCTIONS:
|
| 597 |
+
1. Analyze ALL provided content carefully - even if it seems only tangentially related
|
| 598 |
+
2. Look for connections between the query and the content, even if not immediately obvious
|
| 599 |
+
3. If content is about a parent company/organization mentioned in the query, include relevant information
|
| 600 |
+
4. Extract and synthesize any information that could be relevant to answering the user's question
|
| 601 |
5. Include specific facts, dates, numbers, and quotes when present
|
| 602 |
6. If information is contradictory between sources, mention this
|
| 603 |
7. Cite sources by mentioning the publication or website name
|
| 604 |
+
8. Be thorough and detailed rather than dismissive
|
| 605 |
|
| 606 |
+
ONLY state that results are not relevant if they are completely unrelated to any aspect of the query. If there is ANY connection (like parent company info, related business segments, etc.), include that information.
|
| 607 |
|
| 608 |
Format your response as a comprehensive summary, not bullet points."""
|
| 609 |
|
|
|
|
| 612 |
valid_results = []
|
| 613 |
validation_info = []
|
| 614 |
|
| 615 |
+
# More intelligent keyword extraction
|
| 616 |
+
query_lower = query.lower()
|
| 617 |
+
|
| 618 |
+
# Extract key entities and terms
|
| 619 |
+
important_keywords = []
|
| 620 |
+
|
| 621 |
+
# Split query into words and extract meaningful terms
|
| 622 |
+
words = query_lower.split()
|
| 623 |
+
for word in words:
|
| 624 |
+
if len(word) > 2 and word not in ['news', 'latest', 'recent', 'update', 'information', 'about']:
|
| 625 |
+
important_keywords.append(word)
|
| 626 |
+
|
| 627 |
+
# Also look for multi-word entities (like company names)
|
| 628 |
+
# Extract potential company/entity names from query
|
| 629 |
+
entity_patterns = [
|
| 630 |
+
r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', # Proper names
|
| 631 |
+
r'\b[A-Z]{2,}(?:\s+[A-Z][a-z]+)*\b', # Acronyms
|
| 632 |
+
]
|
| 633 |
+
|
| 634 |
+
for pattern in entity_patterns:
|
| 635 |
+
matches = re.findall(pattern, query)
|
| 636 |
+
for match in matches:
|
| 637 |
+
important_keywords.extend(match.lower().split())
|
| 638 |
+
|
| 639 |
+
# Remove duplicates
|
| 640 |
+
important_keywords = list(set(important_keywords))
|
| 641 |
|
| 642 |
for result in search_results:
|
| 643 |
+
if not result.content or len(result.content.strip()) < 50: # Lowered threshold
|
| 644 |
validation_info.append(f"Skipped '{result.title}' - insufficient content")
|
| 645 |
continue
|
| 646 |
|
|
|
|
| 648 |
content_lower = result.content.lower()
|
| 649 |
title_lower = result.title.lower()
|
| 650 |
snippet_lower = result.snippet.lower()
|
| 651 |
+
combined_text = f"{title_lower} {snippet_lower} {content_lower}"
|
| 652 |
|
| 653 |
+
# More flexible relevance scoring
|
| 654 |
relevant_score = 0
|
| 655 |
+
matched_keywords = []
|
| 656 |
+
|
| 657 |
+
for keyword in important_keywords:
|
| 658 |
+
if keyword in combined_text:
|
| 659 |
if keyword in content_lower:
|
| 660 |
relevant_score += 2
|
| 661 |
+
matched_keywords.append(keyword)
|
| 662 |
elif keyword in title_lower:
|
| 663 |
+
relevant_score += 3 # Title matches are very important
|
| 664 |
+
matched_keywords.append(keyword)
|
| 665 |
elif keyword in snippet_lower:
|
| 666 |
+
relevant_score += 1
|
| 667 |
+
matched_keywords.append(keyword)
|
| 668 |
|
| 669 |
+
# Special handling for acronyms and company names
|
| 670 |
+
# If query contains a company acronym (like KKR), be more lenient
|
| 671 |
+
has_company_match = any(len(kw) <= 4 and kw.isupper() for kw in query.split())
|
| 672 |
+
if has_company_match:
|
| 673 |
+
relevant_score += 1 # Boost score for company-related queries
|
| 674 |
+
|
| 675 |
+
# Lower the threshold and accept more results
|
| 676 |
+
if relevant_score >= 1 or len(matched_keywords) >= 1:
|
| 677 |
valid_results.append(result)
|
| 678 |
+
validation_info.append(f"✓ '{result.title}' - score: {relevant_score}, matched: {matched_keywords}")
|
| 679 |
else:
|
| 680 |
+
validation_info.append(f"Skipped '{result.title}' - no relevant keywords found")
|
| 681 |
+
|
| 682 |
+
# If we filtered out too many results, be more lenient
|
| 683 |
+
if len(valid_results) < len(search_results) * 0.3: # If we filtered out more than 70%
|
| 684 |
+
validation_info.append("⚠️ Too many results filtered, being more lenient...")
|
| 685 |
+
# Add back results that have any content
|
| 686 |
+
for result in search_results:
|
| 687 |
+
if result not in valid_results and result.content.strip():
|
| 688 |
+
valid_results.append(result)
|
| 689 |
+
validation_info.append(f"✓ '{result.title}' - added back (lenient mode)")
|
| 690 |
|
| 691 |
validation_summary = "\n".join(validation_info)
|
| 692 |
return valid_results, validation_summary
|
|
|
|
| 702 |
VALIDATION RESULTS:
|
| 703 |
{validation_info}
|
| 704 |
|
| 705 |
+
The search results did not pass the initial relevance filter, but this might be overly restrictive. Please analyze the raw content provided and extract any information that could be relevant to answering the user's query, even if the connection is not immediately obvious."""
|
| 706 |
|
| 707 |
content_parts = [f'User Query: "{query}"\n']
|
| 708 |
content_parts.append(f"Number of relevant sources found: {len(valid_results)}\n")
|