Pulastya0 committed on
Commit
6806256
·
1 Parent(s): b990ece

Update populate_chroma.py

Browse files
Files changed (1) hide show
  1. populate_chroma.py +14 -23
populate_chroma.py CHANGED
@@ -1,40 +1,31 @@
1
- import os
2
  import json
3
  import chromadb
4
  import firebase_admin
5
  from firebase_admin import credentials, firestore
6
  from encoder import SentenceEncoder
7
 
8
- def initialize_firebase():
9
- """Initializes the Firebase connection using Hugging Face secrets."""
10
- # Get the JSON credentials from the environment variable
11
- creds_json_string = os.getenv("FIREBASE_CREDS_JSON")
12
-
13
- if not creds_json_string:
14
- print("❌ FIREBASE_CREDS_JSON secret not found. Cannot initialize Firebase.")
15
- return None
16
-
17
  try:
18
- # Convert the JSON string back into a dictionary
19
- creds_dict = json.loads(creds_json_string)
20
- cred = credentials.Certificate(creds_dict)
21
 
22
- # Initialize the app (check if it's already initialized)
23
  if not firebase_admin._apps:
24
  firebase_admin.initialize_app(cred)
25
 
26
  db = firestore.client()
27
- print("✅ Firebase connection initialized successfully.")
28
  return db
29
  except Exception as e:
30
- print(f"❌ Could not initialize Firebase. Error: {e}")
 
31
  return None
32
 
33
  def populate_vector_db():
34
  """
35
  Reads internships from Firestore, generates embeddings, and populates ChromaDB.
36
  """
37
- db = initialize_firebase()
38
  if db is None:
39
  return
40
 
@@ -43,21 +34,21 @@ def populate_vector_db():
43
  chroma_client = chromadb.PersistentClient(path="/data/chroma_db")
44
  collection = chroma_client.get_or_create_collection(name="internships")
45
 
46
- # 2. Clear existing data in ChromaDB
47
  if collection.count() > 0:
48
  print(f"ℹ️ Clearing {collection.count()} existing items from ChromaDB.")
49
  collection.delete(ids=collection.get()['ids'])
50
 
51
- # 3. Fetch all data from Firestore
52
  print("📚 Reading internship data from Firestore...")
53
  internships_ref = db.collection('internships').stream()
54
  internships = [doc.to_dict() for doc in internships_ref]
55
 
56
  if not internships:
57
- print("❌ No internship data found in Firestore to process.")
58
  return
59
 
60
- # 4. Generate embeddings and prepare data for ChromaDB
61
  print(f"🧠 Generating embeddings for {len(internships)} internships...")
62
  texts = [f"{i['title']}. {i['description']}. Skills: {', '.join(i['skills'])}" for i in internships]
63
  embeddings = encoder.encode(texts, show_progress_bar=True).tolist()
@@ -67,8 +58,8 @@ def populate_vector_db():
67
  for i in internships:
68
  i['skills'] = json.dumps(i['skills'])
69
  metadatas.append(i)
70
-
71
- # 5. Add data to ChromaDB
72
  print("➕ Adding data to ChromaDB...")
73
  collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
74
  print(f"✅ Successfully populated ChromaDB with {collection.count()} items.")
 
 
1
  import json
2
  import chromadb
3
  import firebase_admin
4
  from firebase_admin import credentials, firestore
5
  from encoder import SentenceEncoder
6
 
7
+ def initialize_firebase_with_file():
8
+ """Initializes Firebase using a local serviceAccountKey.json file."""
 
 
 
 
 
 
 
9
  try:
10
+ # Use the service account key file
11
+ cred = credentials.Certificate("serviceAccountKey.json")
 
12
 
 
13
  if not firebase_admin._apps:
14
  firebase_admin.initialize_app(cred)
15
 
16
  db = firestore.client()
17
+ print("✅ Firebase connection initialized from file.")
18
  return db
19
  except Exception as e:
20
+ print(f"❌ Could not initialize Firebase from file. Error: {e}")
21
+ print(" - Make sure 'serviceAccountKey.json' has been uploaded to the terminal.")
22
  return None
23
 
24
  def populate_vector_db():
25
  """
26
  Reads internships from Firestore, generates embeddings, and populates ChromaDB.
27
  """
28
+ db = initialize_firebase_with_file()
29
  if db is None:
30
  return
31
 
 
34
  chroma_client = chromadb.PersistentClient(path="/data/chroma_db")
35
  collection = chroma_client.get_or_create_collection(name="internships")
36
 
37
+ # 2. Clear existing data
38
  if collection.count() > 0:
39
  print(f"ℹ️ Clearing {collection.count()} existing items from ChromaDB.")
40
  collection.delete(ids=collection.get()['ids'])
41
 
42
+ # 3. Fetch data from Firestore
43
  print("📚 Reading internship data from Firestore...")
44
  internships_ref = db.collection('internships').stream()
45
  internships = [doc.to_dict() for doc in internships_ref]
46
 
47
  if not internships:
48
+ print("❌ No internship data found in Firestore.")
49
  return
50
 
51
+ # 4. Generate embeddings
52
  print(f"🧠 Generating embeddings for {len(internships)} internships...")
53
  texts = [f"{i['title']}. {i['description']}. Skills: {', '.join(i['skills'])}" for i in internships]
54
  embeddings = encoder.encode(texts, show_progress_bar=True).tolist()
 
58
  for i in internships:
59
  i['skills'] = json.dumps(i['skills'])
60
  metadatas.append(i)
61
+
62
+ # 5. Add to ChromaDB
63
  print("➕ Adding data to ChromaDB...")
64
  collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas)
65
  print(f"✅ Successfully populated ChromaDB with {collection.count()} items.")