Autonomous_Data_Scientist / seed_vector_store.py
Megha Panicker
Resolve README.md conflict with HF Spaces metadata
c1b226b
#!/usr/bin/env python3
"""
Seed the ChromaDB 'company_policies' collection with sample policy documents.
Run once: python3 seed_vector_store.py
Requires Ollama with nomic-embed-text: ollama pull nomic-embed-text
"""
from pathlib import Path
from dotenv import load_dotenv
load_dotenv(Path(__file__).parent / ".env")
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document
# Name of the ChromaDB collection that receives the seed documents.
COLLECTION_NAME = "company_policies"

# Directory of the persistent Chroma store, located next to this script.
PERSIST_DIR = Path(__file__).parent.joinpath("data", "chroma_db")

# Seed records as (source file name, document type, text) triples. Kept as
# plain data so that adding a policy is a one-entry change.
_POLICY_RECORDS = (
    (
        "expense_policy.pdf",
        "policy",
        "Expense Policy: All business expenses must be pre-approved for amounts over $500.\n"
        "Submit receipts within 30 days. Air travel must be economy class unless trip exceeds 8 hours.\n"
        "Maximum daily meal allowance: $75 for domestic, $100 for international.",
    ),
    (
        "remote_work.md",
        "policy",
        "Remote Work Policy: Employees may work remotely up to 3 days per week with manager approval.\n"
        "Core hours 10am-3pm local time are required. VPN must be used for all company systems.\n"
        "Equipment reimbursement up to $500 for home office setup.",
    ),
    (
        "leave_policy.pdf",
        "policy",
        "Leave Policy: Full-time employees receive 15 days PTO per year, 10 sick days.\n"
        "Unused PTO carries over up to 5 days. bereavement leave: 5 days. Parental leave: 12 weeks paid.",
    ),
    (
        "security_policy.pdf",
        "policy",
        "Data Security Policy: All customer data must be encrypted at rest and in transit.\n"
        "Access to production databases requires 2FA and manager approval. No PII in logs or error messages.\n"
        "Incident reporting within 24 hours.",
    ),
    (
        "engineering_handbook.md",
        "documentation",
        "Code Review Process: All PRs require 2 approvals before merge. Run tests locally.\n"
        "No direct commits to main. Use feature branches. Document breaking changes in CHANGELOG.",
    ),
)

# Sample documents written to the collection; each one carries its origin
# file name and a coarse type tag as metadata.
SAMPLE_POLICIES = [
    Document(page_content=text, metadata={"source": source, "type": doc_type})
    for source, doc_type, text in _POLICY_RECORDS
]
def main():
    """Rebuild the ChromaDB collection named by COLLECTION_NAME from SAMPLE_POLICIES.

    Creates the persistence directory if it does not exist, attempts to drop
    any existing collection with the same name so that re-running the script
    does not accumulate duplicate documents, then stores SAMPLE_POLICIES using
    the ``nomic-embed-text`` Ollama embedding model and prints a one-line
    summary.

    Dropping the old collection is best-effort: a failure there is reported
    on stderr but does not stop the seeding step.
    """
    import sys  # local import: only needed for the warning path below

    PERSIST_DIR.mkdir(parents=True, exist_ok=True)
    persist_path = str(PERSIST_DIR)
    embeddings = OllamaEmbeddings(model="nomic-embed-text")

    # Idempotent: delete existing collection to avoid duplicates.
    try:
        Chroma(
            collection_name=COLLECTION_NAME,
            embedding_function=embeddings,
            persist_directory=persist_path,
        ).delete_collection()
    except Exception as exc:
        # Keep going, but make the failure visible: if the old collection
        # survived, the documents added below may end up duplicated.
        print(
            f"Warning: could not delete existing collection '{COLLECTION_NAME}': {exc}",
            file=sys.stderr,
        )

    Chroma.from_documents(
        documents=SAMPLE_POLICIES,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        persist_directory=persist_path,
    )
    print(f"Seeded {len(SAMPLE_POLICIES)} documents into ChromaDB collection '{COLLECTION_NAME}'.")
# Run the seeding routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()