#!/usr/bin/env python3
"""Seed the ChromaDB 'company_policies' collection with sample policy documents.

Run once:
    python3 seed_vector_store.py

Requires Ollama with nomic-embed-text:
    ollama pull nomic-embed-text
"""
import sys
from pathlib import Path

from dotenv import load_dotenv

# NOTE(review): the .env file is loaded before the LangChain imports below,
# presumably so their settings are visible at import time -- keep this order.
load_dotenv(Path(__file__).parent / ".env")

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

COLLECTION_NAME = "company_policies"
PERSIST_DIR = Path(__file__).parent / "data" / "chroma_db"
EMBEDDING_MODEL = "nomic-embed-text"

# Sample documents written to the collection on every run.
SAMPLE_POLICIES = [
    Document(
        page_content=(
            "Expense Policy: All business expenses must be pre-approved for "
            "amounts over $500. Submit receipts within 30 days. Air travel "
            "must be economy class unless trip exceeds 8 hours. Maximum daily "
            "meal allowance: $75 for domestic, $100 for international."
        ),
        metadata={"source": "expense_policy.pdf", "type": "policy"},
    ),
    Document(
        page_content=(
            "Remote Work Policy: Employees may work remotely up to 3 days per "
            "week with manager approval. Core hours 10am-3pm local time are "
            "required. VPN must be used for all company systems. Equipment "
            "reimbursement up to $500 for home office setup."
        ),
        metadata={"source": "remote_work.md", "type": "policy"},
    ),
    Document(
        page_content=(
            "Leave Policy: Full-time employees receive 15 days PTO per year, "
            "10 sick days. Unused PTO carries over up to 5 days. bereavement "
            "leave: 5 days. Parental leave: 12 weeks paid."
        ),
        metadata={"source": "leave_policy.pdf", "type": "policy"},
    ),
    Document(
        page_content=(
            "Data Security Policy: All customer data must be encrypted at "
            "rest and in transit. Access to production databases requires 2FA "
            "and manager approval. No PII in logs or error messages. Incident "
            "reporting within 24 hours."
        ),
        metadata={"source": "security_policy.pdf", "type": "policy"},
    ),
    Document(
        page_content=(
            "Code Review Process: All PRs require 2 approvals before merge. "
            "Run tests locally. No direct commits to main. "
            "Use feature branches. \n"
            "Document breaking changes in CHANGELOG."
        ),
        metadata={"source": "engineering_handbook.md", "type": "documentation"},
    ),
]


def main() -> None:
    """Rebuild the policy collection from ``SAMPLE_POLICIES``.

    Creates the persistence directory if it is missing, drops any existing
    collection named ``COLLECTION_NAME`` so that re-running the script does
    not add duplicates, then embeds and stores the sample documents and
    prints a one-line summary.

    A failure while dropping the old collection is reported on stderr and
    seeding continues. Errors raised while embedding or storing the
    documents are not caught here.
    """
    PERSIST_DIR.mkdir(parents=True, exist_ok=True)
    persist_path = str(PERSIST_DIR)
    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

    # Idempotent: delete existing collection to avoid duplicates.
    try:
        existing = Chroma(
            collection_name=COLLECTION_NAME,
            embedding_function=embeddings,
            persist_directory=persist_path,
        )
        existing.delete_collection()
    except Exception as exc:
        # Best-effort cleanup: keep going, but never hide the failure --
        # if the old collection survived, the seed may contain duplicates.
        print(
            f"Warning: could not reset collection '{COLLECTION_NAME}' "
            f"({type(exc).__name__}: {exc}); continuing with seeding.",
            file=sys.stderr,
        )

    Chroma.from_documents(
        documents=SAMPLE_POLICIES,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        persist_directory=persist_path,
    )
    print(f"Seeded {len(SAMPLE_POLICIES)} documents into ChromaDB collection '{COLLECTION_NAME}'.")


if __name__ == "__main__":
    main()