#!/usr/bin/env python3
"""
RAG-Helper: minimal, reproducible toy script for AI-SEO retrieval demos.
- Fetches a URL
- Extracts text
- Chunks text into ~300-"token" pieces (word-count approximation)
- Creates embeddings (sentence-transformers)
- (Optional) Upserts into Qdrant
- Generates a short "copy-cite" answer block with footnotes
"""
import argparse, re, uuid, json, os
from typing import Dict, List, Optional
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np
try:
from sentence_transformers import SentenceTransformer
except Exception:
raise SystemExit("Please install requirements: pip install -r requirements.txt")
def fetch_url(url: str) -> str:
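    """Download a page's raw HTML; raises for non-2xx responses."""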
r = requests.get(url, timeout=30)
r.raise_for_status()
return r.text
def html_to_text(html: str) -> str:
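    """Strip script/style/noscript tags and collapse whitespace into one plain-text string."""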
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
text = soup.get_text(separator=" ")
return re.sub(r"\s+", " ", text).strip()
def chunk_text(text: str, target_tokens: int = 300) -> List[str]:
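    """Split text into chunks of ~target_tokens words (word count stands in for tokens)."""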
words = text.split()
chunks = []
for i in range(0, len(words), target_tokens):
chunk = " ".join(words[i:i+target_tokens])
if chunk:
chunks.append(chunk)
return chunks
def embed_chunks(chunks: List[str], model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
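    """Embed chunks with sentence-transformers; vectors are L2-normalized, so dot product equals cosine similarity."""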
model = SentenceTransformer(model_name)
return model.encode(chunks, batch_size=32, show_progress_bar=True,
convert_to_numpy=True, normalize_embeddings=True)
def build_payload(chunks: List[str], embs: np.ndarray, source_url: str, entity: str = "", sector: str = "") -> List[Dict]:
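    """Pair each chunk with its embedding plus source/entity/sector/position metadata."""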
vectors = []
for idx, (c, v) in enumerate(zip(chunks, embs)):
vectors.append({
"id": str(uuid.uuid4()),
"text": c,
"vector": v.tolist(),
"metadata": {
"source": source_url,
"entity": entity,
"sector": sector,
"position": idx
}
})
return vectors
def optional_qdrant_upsert(vectors: List[Dict], collection: str, qdrant_url: Optional[str] = None, api_key: Optional[str] = None):
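    """Upsert vectors into Qdrant if qdrant-client is available; otherwise print a notice and skip."""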
try:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
except Exception:
print("qdrant-client not installed; skipping vector DB upsert.")
return
client = QdrantClient(url=qdrant_url or "http://localhost:6333", api_key=api_key)
    if not vectors:
        print("No vectors to upsert; skipping.")
        return
    dim = len(vectors[0]["vector"])
    try:
        client.get_collection(collection)
    except Exception:
        # Collection doesn't exist yet: create it with cosine distance,
        # which matches the normalized embeddings produced by embed_chunks().
        client.create_collection(
collection_name=collection,
vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
)
    # Store the chunk text alongside its metadata in each point's payload.
    points = [PointStruct(id=v["id"], vector=v["vector"], payload={**v["metadata"], "text": v["text"]})
              for v in vectors]
client.upsert(collection_name=collection, points=points)
print(f"Upserted {len(points)} vectors into Qdrant collection '{collection}'.")
def make_copy_cite(vectors: List[Dict], k: int = 3) -> str:
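    """Draft a quotable answer block from the first k chunks in document order (no relevance ranking), with numbered source footnotes."""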
top = vectors[:k]
bullets = []
for i, v in enumerate(top, start=1):
snippet = v["text"][:280] + ("..." if len(v["text"]) > 280 else "")
bullets.append(f"- {snippet} [{i}]")
footnotes = "\n".join([f"[{i}] {v['metadata']['source']}" for i, v in enumerate(top, start=1)])
return f"**Answer (draft):**\n" + "\n".join(bullets) + "\n\n" + footnotes
def main():
ap = argparse.ArgumentParser(description="NebulaTech RAG-Helper (toy)")
ap.add_argument("--url", required=True, help="Public URL to ingest")
ap.add_argument("--entity", default="", help="Primary entity (brand/product/topic)")
ap.add_argument("--sector", default="", help="Sector tag (e.g., architecture, pharma)")
ap.add_argument("--qdrant-url", default=None, help="Qdrant endpoint (optional)")
ap.add_argument("--qdrant-key", default=None, help="Qdrant API key (optional)")
ap.add_argument("--collection", default="nebula_rag_helper", help="Qdrant collection name")
ap.add_argument("--out", default="output.jsonl", help="Local JSONL output")
args = ap.parse_args()
print(f"[1/5] Fetching: {args.url}")
html = fetch_url(args.url)
text = html_to_text(html)
print("[2/5] Chunking ~300 tokens...")
chunks = chunk_text(text)
if not chunks:
raise SystemExit("No text extracted; aborting.")
print(f"[3/5] Embedding {len(chunks)} chunks...")
embs = embed_chunks(chunks)
print("[4/5] Building vectors + metadata...")
vectors = build_payload(chunks, embs, source_url=args.url, entity=args.entity, sector=args.sector)
if args.qdrant_url:
optional_qdrant_upsert(vectors, collection=args.collection, qdrant_url=args.qdrant_url, api_key=args.qdrant_key)
    print("[5/5] Writing JSONL + copy-cite outputs...")
    with open(args.out, "w", encoding="utf-8") as f:
for v in vectors:
f.write(json.dumps(v, ensure_ascii=False) + "\n")
print(f"Wrote {len(vectors)} vectors to {args.out}")
copy_cite = make_copy_cite(vectors, k=3)
cc_path = os.path.splitext(args.out)[0] + "_copycite.md"
with open(cc_path, "w", encoding="utf-8") as f:
f.write(copy_cite)
print(f"Generated copy-cite block at {cc_path}")
if __name__ == "__main__":
main()