#!/usr/bin/env python3
"""

RAG-Helper: minimal, reproducible toy script for AI-SEO retrieval demos.

- Fetches a URL

- Extracts text

- Chunks ~300 "tokens" (word approximation)

- Creates embeddings (sentence-transformers)

- (Optional) Upserts into Qdrant

- Generates a short "copy-cite" answer block with footnotes

"""

import argparse
import json
import os
import re
import uuid
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
import numpy as np

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    raise SystemExit("sentence-transformers is required; install with: pip install -r requirements.txt")


def fetch_url(url: str) -> str:
    """Fetch a URL and return its raw HTML; raises on non-2xx responses."""
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.text


def html_to_text(html: str) -> str:
    """Strip script/style/noscript tags and collapse whitespace to plain text."""
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator=" ")
    return re.sub(r"\s+", " ", text).strip()


def chunk_text(text: str, target_tokens: int = 300) -> List[str]:
    """Split text into ~target_tokens-word chunks (word count approximates tokens)."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), target_tokens):
        chunk = " ".join(words[i:i + target_tokens])
        if chunk:
            chunks.append(chunk)
    return chunks


def embed_chunks(chunks: List[str], model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
    """Encode chunks into L2-normalized embeddings (cosine similarity == dot product)."""
    model = SentenceTransformer(model_name)
    return model.encode(chunks, batch_size=32, show_progress_bar=True,
                        convert_to_numpy=True, normalize_embeddings=True)
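

# Not used by the pipeline below: a minimal retrieval sketch showing why
# normalize_embeddings=True matters. On unit vectors, cosine similarity is a
# plain dot product, so ranking chunks against a query is one matmul.
# (top_k_chunks is an illustrative helper, not part of the original script.)
def top_k_chunks(query: str, chunks: List[str], embs: np.ndarray, k: int = 3,
                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> List[str]:
    model = SentenceTransformer(model_name)
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
    scores = embs @ q  # cosine similarity, since both sides are L2-normalized
    return [chunks[i] for i in np.argsort(-scores)[:k]]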


def build_payload(chunks: List[str], embs: np.ndarray, source_url: str, entity: str = "", sector: str = "") -> List[Dict]:
    """Pair each chunk with its vector plus source/entity/sector/position metadata."""
    vectors = []
    for idx, (c, v) in enumerate(zip(chunks, embs)):
        vectors.append({
            "id": str(uuid.uuid4()),
            "text": c,
            "vector": v.tolist(),
            "metadata": {
                "source": source_url,
                "entity": entity,
                "sector": sector,
                "position": idx
            }
        })
    return vectors
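
# Each resulting record (written to JSONL and optionally upserted) looks like:
# {
#   "id": "<uuid4>",
#   "text": "...chunk text...",
#   "vector": [0.012, -0.034, ...],  # 384 dims for all-MiniLM-L6-v2
#   "metadata": {"source": "<url>", "entity": "", "sector": "", "position": 0}
# }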


def optional_qdrant_upsert(vectors: List[Dict], collection: str, qdrant_url: str = None, api_key: str = None):
    """Upsert vectors into Qdrant if qdrant-client is available; otherwise no-op."""
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.models import PointStruct, Distance, VectorParams
    except ImportError:
        print("qdrant-client not installed; skipping vector DB upsert.")
        return

    client = QdrantClient(url=qdrant_url or "http://localhost:6333", api_key=api_key)
    dim = len(vectors[0]["vector"])

    # Create the collection only if it does not already exist.
    try:
        client.get_collection(collection)
    except Exception:
        client.recreate_collection(
            collection_name=collection,
            vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
        )

    # dict | dict requires Python 3.9+; the payload carries chunk text alongside metadata.
    points = [PointStruct(id=v["id"], vector=v["vector"], payload=v["metadata"] | {"text": v["text"]}) for v in vectors]
    client.upsert(collection_name=collection, points=points)
    print(f"Upserted {len(points)} vectors into Qdrant collection '{collection}'.")


def make_copy_cite(vectors: List[Dict], k: int = 3) -> str:
    """Draft a citable answer block from the first k chunks (document order, not relevance)."""
    top = vectors[:k]
    bullets = []
    for i, v in enumerate(top, start=1):
        snippet = v["text"][:280] + ("..." if len(v["text"]) > 280 else "")
        bullets.append(f"- {snippet} [{i}]")
    footnotes = "\n".join(f"[{i}] {v['metadata']['source']}" for i, v in enumerate(top, start=1))
    return "**Answer (draft):**\n" + "\n".join(bullets) + "\n\n" + footnotes


def main():
    ap = argparse.ArgumentParser(description="NebulaTech RAG-Helper (toy)")
    ap.add_argument("--url", required=True, help="Public URL to ingest")
    ap.add_argument("--entity", default="", help="Primary entity (brand/product/topic)")
    ap.add_argument("--sector", default="", help="Sector tag (e.g., architecture, pharma)")
    ap.add_argument("--qdrant-url", default=None, help="Qdrant endpoint (optional)")
    ap.add_argument("--qdrant-key", default=None, help="Qdrant API key (optional)")
    ap.add_argument("--collection", default="nebula_rag_helper", help="Qdrant collection name")
    ap.add_argument("--out", default="output.jsonl", help="Local JSONL output")
    args = ap.parse_args()

    print(f"[1/5] Fetching: {args.url}")
    html = fetch_url(args.url)
    text = html_to_text(html)

    print("[2/5] Chunking ~300 tokens...")
    chunks = chunk_text(text)
    if not chunks:
        raise SystemExit("No text extracted; aborting.")

    print(f"[3/5] Embedding {len(chunks)} chunks...")
    embs = embed_chunks(chunks)

    print("[4/5] Building vectors + metadata...")
    vectors = build_payload(chunks, embs, source_url=args.url, entity=args.entity, sector=args.sector)

    if args.qdrant_url:
        optional_qdrant_upsert(vectors, collection=args.collection, qdrant_url=args.qdrant_url, api_key=args.qdrant_key)

    print("[5/5] Writing outputs...")
    with open(args.out, "w", encoding="utf-8") as f:
        for v in vectors:
            f.write(json.dumps(v, ensure_ascii=False) + "\n")
    print(f"Wrote {len(vectors)} vectors to {args.out}")

    copy_cite = make_copy_cite(vectors, k=3)
    cc_path = os.path.splitext(args.out)[0] + "_copycite.md"
    with open(cc_path, "w", encoding="utf-8") as f:
        f.write(copy_cite)
    print(f"Generated copy-cite block at {cc_path}")


if __name__ == "__main__":
    main()