#!/usr/bin/env python3
"""

RAG-Helper: minimal, reproducible toy script for AI-SEO retrieval demos.

- Fetches a URL

- Extracts text

- Chunks ~300 "tokens" (word approximation)

- Creates embeddings (sentence-transformers)

- (Optional) Upserts into Qdrant

- Generates a short "copy-cite" answer block with footnotes

"""

import argparse
import json
import os
import re
import uuid
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
import numpy as np

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    raise SystemExit("sentence-transformers is required; install with: pip install -r requirements.txt")


def fetch_url(url: str) -> str:
    """Fetch a URL and return its raw HTML; raises on non-2xx responses."""
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.text


def html_to_text(html: str) -> str:
    """Strip script/style/noscript tags and collapse whitespace to plain text."""
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator=" ")
    return re.sub(r"\s+", " ", text).strip()


def chunk_text(text: str, target_tokens: int = 300) -> List[str]:
    """Split text into ~target_tokens-word chunks (word count approximates tokens)."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), target_tokens):
        chunk = " ".join(words[i:i + target_tokens])
        if chunk:
            chunks.append(chunk)
    return chunks


def embed_chunks(chunks: List[str], model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
    """Encode chunks into L2-normalized embeddings (cosine similarity == dot product)."""
    model = SentenceTransformer(model_name)
    return model.encode(chunks, batch_size=32, show_progress_bar=True,
                        convert_to_numpy=True, normalize_embeddings=True)
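

# Not used by the pipeline below: a minimal retrieval sketch showing why
# normalize_embeddings=True matters. On unit vectors, cosine similarity is a
# plain dot product, so ranking chunks against a query is one matmul.
# (top_k_chunks is an illustrative helper, not part of the original script.)
def top_k_chunks(query: str, chunks: List[str], embs: np.ndarray, k: int = 3,
                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> List[str]:
    model = SentenceTransformer(model_name)
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
    scores = embs @ q  # cosine similarity, since both sides are L2-normalized
    return [chunks[i] for i in np.argsort(-scores)[:k]]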


def build_payload(chunks: List[str], embs: np.ndarray, source_url: str, entity: str = "", sector: str = "") -> List[Dict]:
    """Pair each chunk with its vector plus source/entity/sector/position metadata."""
    vectors = []
    for idx, (c, v) in enumerate(zip(chunks, embs)):
        vectors.append({
            "id": str(uuid.uuid4()),
            "text": c,
            "vector": v.tolist(),
            "metadata": {
                "source": source_url,
                "entity": entity,
                "sector": sector,
                "position": idx
            }
        })
    return vectors
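
# Each resulting record (written to JSONL and optionally upserted) looks like:
# {
#   "id": "<uuid4>",
#   "text": "...chunk text...",
#   "vector": [0.012, -0.034, ...],  # 384 dims for all-MiniLM-L6-v2
#   "metadata": {"source": "<url>", "entity": "", "sector": "", "position": 0}
# }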


def optional_qdrant_upsert(vectors: List[Dict], collection: str, qdrant_url: str = None, api_key: str = None):
    """Upsert vectors into Qdrant if qdrant-client is available; otherwise no-op."""
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import PointStruct, Distance, VectorParams
    except ImportError:
        print("qdrant-client not installed; skipping vector DB upsert.")
        return

    client = QdrantClient(url=qdrant_url or "http://localhost:6333", api_key=api_key)
    dim = len(vectors[0]["vector"])

    # Create the collection only if it does not already exist.
    try:
        client.get_collection(collection)
    except Exception:
        client.recreate_collection(
            collection_name=collection,
            vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
        )

    # dict | dict requires Python 3.9+; the payload carries chunk text alongside metadata.
    points = [PointStruct(id=v["id"], vector=v["vector"], payload=v["metadata"] | {"text": v["text"]}) for v in vectors]
    client.upsert(collection_name=collection, points=points)
    print(f"Upserted {len(points)} vectors into Qdrant collection '{collection}'.")


def make_copy_cite(vectors: List[Dict], k: int = 3) -> str:
    """Draft a citable answer block from the first k chunks (document order, not relevance)."""
    top = vectors[:k]
    bullets = []
    for i, v in enumerate(top, start=1):
        snippet = v["text"][:280] + ("..." if len(v["text"]) > 280 else "")
        bullets.append(f"- {snippet} [{i}]")
    footnotes = "\n".join(f"[{i}] {v['metadata']['source']}" for i, v in enumerate(top, start=1))
    return "**Answer (draft):**\n" + "\n".join(bullets) + "\n\n" + footnotes


def main():
    ap = argparse.ArgumentParser(description="NebulaTech RAG-Helper (toy)")
    ap.add_argument("--url", required=True, help="Public URL to ingest")
    ap.add_argument("--entity", default="", help="Primary entity (brand/product/topic)")
    ap.add_argument("--sector", default="", help="Sector tag (e.g., architecture, pharma)")
    ap.add_argument("--qdrant-url", default=None, help="Qdrant endpoint (optional)")
    ap.add_argument("--qdrant-key", default=None, help="Qdrant API key (optional)")
    ap.add_argument("--collection", default="nebula_rag_helper", help="Qdrant collection name")
    ap.add_argument("--out", default="output.jsonl", help="Local JSONL output")
    args = ap.parse_args()

    print(f"[1/5] Fetching: {args.url}")
    html = fetch_url(args.url)
    text = html_to_text(html)

    print("[2/5] Chunking ~300 tokens...")
    chunks = chunk_text(text)
    if not chunks:
        raise SystemExit("No text extracted; aborting.")

    print(f"[3/5] Embedding {len(chunks)} chunks...")
    embs = embed_chunks(chunks)

    print("[4/5] Building vectors + metadata...")
    vectors = build_payload(chunks, embs, source_url=args.url, entity=args.entity, sector=args.sector)

    if args.qdrant_url:
        optional_qdrant_upsert(vectors, collection=args.collection, qdrant_url=args.qdrant_url, api_key=args.qdrant_key)

    print("[5/5] Writing outputs...")
    with open(args.out, "w", encoding="utf-8") as f:
        for v in vectors:
            f.write(json.dumps(v, ensure_ascii=False) + "\n")
    print(f"Wrote {len(vectors)} vectors to {args.out}")

    copy_cite = make_copy_cite(vectors, k=3)
    cc_path = os.path.splitext(args.out)[0] + "_copycite.md"
    with open(cc_path, "w", encoding="utf-8") as f:
        f.write(copy_cite)
    print(f"Generated copy-cite block at {cc_path}")


if __name__ == "__main__":
    main()