"""
|
|
|
RAG-Helper: minimal, reproducible toy script for AI-SEO retrieval demos.
|
|
|
- Fetches a URL
|
|
|
- Extracts text
|
|
|
- Chunks ~300 "tokens" (word approximation)
|
|
|
- Creates embeddings (sentence-transformers)
|
|
|
- (Optional) Upserts into Qdrant
|
|
|
- Generates a short "copy-cite" answer block with footnotes
|
|
|
"""
import argparse
import json
import os
import re
import uuid
from typing import Dict, List

import numpy as np
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    raise SystemExit("Please install requirements: pip install -r requirements.txt")

def fetch_url(url: str) -> str:
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.text

def html_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator=" ")
    return re.sub(r"\s+", " ", text).strip()

def chunk_text(text: str, target_tokens: int = 300) -> List[str]:
    """Split text into chunks of ~target_tokens whitespace-separated words."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), target_tokens):
        chunk = " ".join(words[i:i + target_tokens])
        if chunk:
            chunks.append(chunk)
    return chunks

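# Quick sanity check (illustrative): chunk_text splits on whitespace, so "tokens" here
# are words, not model tokens. For example, 650 words with the default target of 300
# produce three chunks of 300, 300, and 50 words:
#
#   >>> [len(c.split()) for c in chunk_text(" ".join(["word"] * 650))]
#   [300, 300, 50]
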
def embed_chunks(chunks: List[str], model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
    model = SentenceTransformer(model_name)
    return model.encode(chunks, batch_size=32, show_progress_bar=True,
                        convert_to_numpy=True, normalize_embeddings=True)

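# Shape note (illustrative, assuming the default all-MiniLM-L6-v2 model): the returned
# array has shape (len(chunks), 384), and because normalize_embeddings=True the vectors
# are unit-length, so cosine similarity reduces to a plain dot product:
#
#   >>> embs = embed_chunks(["first chunk", "second chunk"])
#   >>> embs.shape
#   (2, 384)
#   >>> float(np.dot(embs[0], embs[0]))  # ~1.0 for a normalized vector
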
def build_payload(chunks: List[str], embs: np.ndarray, source_url: str, entity: str = "", sector: str = "") -> List[Dict]:
    """Pair each chunk with its embedding and attach provenance metadata."""
    vectors = []
    for idx, (c, v) in enumerate(zip(chunks, embs)):
        vectors.append({
            "id": str(uuid.uuid4()),
            "text": c,
            "vector": v.tolist(),
            "metadata": {
                "source": source_url,
                "entity": entity,
                "sector": sector,
                "position": idx,
            },
        })
    return vectors

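# Each record written to the JSONL output (and upserted into Qdrant) looks roughly like
# this (vector truncated for readability; the id is a random UUID):
#
#   {"id": "2f6c...", "text": "First ~300 words of the page ...",
#    "vector": [0.012, -0.087, ...],
#    "metadata": {"source": "https://example.com/post", "entity": "NebulaTech",
#                 "sector": "architecture", "position": 0}}
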
def optional_qdrant_upsert(vectors: List[Dict], collection: str, qdrant_url: str = None, api_key: str = None):
    """Upsert vectors into Qdrant if qdrant-client is installed; otherwise skip quietly."""
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import PointStruct, Distance, VectorParams
    except Exception:
        print("qdrant-client not installed; skipping vector DB upsert.")
        return

    client = QdrantClient(url=qdrant_url or "http://localhost:6333", api_key=api_key)
    dim = len(vectors[0]["vector"])

    # Create the collection on first use; reuse it if it already exists.
    try:
        client.get_collection(collection)
    except Exception:
        client.recreate_collection(
            collection_name=collection,
            vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
        )

    points = [PointStruct(id=v["id"], vector=v["vector"], payload=v["metadata"] | {"text": v["text"]}) for v in vectors]
    client.upsert(collection_name=collection, points=points)
    print(f"Upserted {len(points)} vectors into Qdrant collection '{collection}'.")

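# Retrieval sketch (illustrative, not called by this script): once chunks are upserted,
# a nearest-neighbour query against the same collection might look like the following,
# assuming qdrant-client's synchronous API and the same embedding model:
#
#   query_vec = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").encode(
#       "what does NebulaTech do?", normalize_embeddings=True).tolist()
#   hits = QdrantClient(url="http://localhost:6333").search(
#       collection_name="nebula_rag_helper", query_vector=query_vec, limit=3)
#   for hit in hits:
#       print(hit.score, hit.payload["text"][:80])
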
def make_copy_cite(vectors: List[Dict], k: int = 3) -> str:
    """Draft a short answer block from the first k chunks, with numbered source footnotes."""
    top = vectors[:k]
    bullets = []
    for i, v in enumerate(top, start=1):
        snippet = v["text"][:280] + ("..." if len(v["text"]) > 280 else "")
        bullets.append(f"- {snippet} [{i}]")
    footnotes = "\n".join([f"[{i}] {v['metadata']['source']}" for i, v in enumerate(top, start=1)])
    return "**Answer (draft):**\n" + "\n".join(bullets) + "\n\n" + footnotes

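# The generated *_copycite.md block looks roughly like this (snippets shortened; all
# footnotes point at the single ingested URL):
#
#   **Answer (draft):**
#   - First chunk snippet... [1]
#   - Second chunk snippet... [2]
#   - Third chunk snippet... [3]
#
#   [1] https://example.com/post
#   [2] https://example.com/post
#   [3] https://example.com/post
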
def main():
    ap = argparse.ArgumentParser(description="NebulaTech RAG-Helper (toy)")
    ap.add_argument("--url", required=True, help="Public URL to ingest")
    ap.add_argument("--entity", default="", help="Primary entity (brand/product/topic)")
    ap.add_argument("--sector", default="", help="Sector tag (e.g., architecture, pharma)")
    ap.add_argument("--qdrant-url", default=None, help="Qdrant endpoint (optional)")
    ap.add_argument("--qdrant-key", default=None, help="Qdrant API key (optional)")
    ap.add_argument("--collection", default="nebula_rag_helper", help="Qdrant collection name")
    ap.add_argument("--out", default="output.jsonl", help="Local JSONL output")
    args = ap.parse_args()

    print(f"[1/5] Fetching: {args.url}")
    html = fetch_url(args.url)
    text = html_to_text(html)

    print("[2/5] Chunking ~300 tokens...")
    chunks = chunk_text(text)
    if not chunks:
        raise SystemExit("No text extracted; aborting.")

    print(f"[3/5] Embedding {len(chunks)} chunks...")
    embs = embed_chunks(chunks)

    print("[4/5] Building vectors + metadata...")
    vectors = build_payload(chunks, embs, source_url=args.url, entity=args.entity, sector=args.sector)

    if args.qdrant_url:
        optional_qdrant_upsert(vectors, collection=args.collection, qdrant_url=args.qdrant_url, api_key=args.qdrant_key)

    print("[5/5] Writing local outputs...")
    with open(args.out, "w", encoding="utf-8") as f:
        for v in vectors:
            f.write(json.dumps(v, ensure_ascii=False) + "\n")
    print(f"Wrote {len(vectors)} vectors to {args.out}")

    copy_cite = make_copy_cite(vectors, k=3)
    cc_path = os.path.splitext(args.out)[0] + "_copycite.md"
    with open(cc_path, "w", encoding="utf-8") as f:
        f.write(copy_cite)
    print(f"Generated copy-cite block at {cc_path}")

if __name__ == "__main__":
    main()