| | |
| | """ |
| | Seed the ChromaDB 'company_policies' collection with sample policy documents. |
| | Run with: python3 seed_vector_store.py (each run first tries to drop the existing collection, then re-seeds it). |
| | |
| | Requires Ollama with nomic-embed-text: ollama pull nomic-embed-text |
| | """ |
| |
|
| | from pathlib import Path |
| |
|
| | from dotenv import load_dotenv |
| |
|
| | load_dotenv(Path(__file__).parent / ".env") |
| |
|
| | from langchain_chroma import Chroma |
| | from langchain_ollama import OllamaEmbeddings |
| | from langchain_core.documents import Document |
| |
|
# Name of the Chroma collection this script seeds.
| | COLLECTION_NAME = "company_policies" |
# On-disk location of the Chroma store: <directory of this file>/data/chroma_db.
| | PERSIST_DIR = Path(__file__).parent / "data" / "chroma_db" |
| |
|
# Sample documents to embed. Each Document carries its text in page_content and
# a metadata dict with a "source" file name and a "type" tag ("policy" for the
# first four entries, "documentation" for the code-review entry).
# NOTE(review): "bereavement leave" in the leave policy starts lowercase, unlike
# the neighbouring sentences; left untouched because page_content is the text
# that gets embedded -- confirm before normalizing.
| | SAMPLE_POLICIES = [ |
| | Document( |
| | page_content="""Expense Policy: All business expenses must be pre-approved for amounts over $500. |
| | Submit receipts within 30 days. Air travel must be economy class unless trip exceeds 8 hours. |
| | Maximum daily meal allowance: $75 for domestic, $100 for international.""", |
| | metadata={"source": "expense_policy.pdf", "type": "policy"}, |
| | ), |
| | Document( |
| | page_content="""Remote Work Policy: Employees may work remotely up to 3 days per week with manager approval. |
| | Core hours 10am-3pm local time are required. VPN must be used for all company systems. |
| | Equipment reimbursement up to $500 for home office setup.""", |
| | metadata={"source": "remote_work.md", "type": "policy"}, |
| | ), |
| | Document( |
| | page_content="""Leave Policy: Full-time employees receive 15 days PTO per year, 10 sick days. |
| | Unused PTO carries over up to 5 days. bereavement leave: 5 days. Parental leave: 12 weeks paid.""", |
| | metadata={"source": "leave_policy.pdf", "type": "policy"}, |
| | ), |
| | Document( |
| | page_content="""Data Security Policy: All customer data must be encrypted at rest and in transit. |
| | Access to production databases requires 2FA and manager approval. No PII in logs or error messages. |
| | Incident reporting within 24 hours.""", |
| | metadata={"source": "security_policy.pdf", "type": "policy"}, |
| | ), |
| | Document( |
| | page_content="""Code Review Process: All PRs require 2 approvals before merge. Run tests locally. |
| | No direct commits to main. Use feature branches. Document breaking changes in CHANGELOG.""", |
| | metadata={"source": "engineering_handbook.md", "type": "documentation"}, |
| | ), |
| | ] |
| |
|
| |
|
def main():
    """Seed the Chroma collection named COLLECTION_NAME with SAMPLE_POLICIES.

    Steps:
      1. Create PERSIST_DIR (and any missing parents) if it does not exist.
      2. Make a best-effort attempt to delete an existing collection with the
         same name (presumably so that re-running the script does not
         accumulate duplicate documents -- confirm against Chroma's id
         handling).
      3. Embed SAMPLE_POLICIES with the Ollama ``nomic-embed-text`` model and
         store them in the collection under PERSIST_DIR.
      4. Print a one-line summary to stdout.

    A failure in step 2 does not abort the run; it is reported on stderr and
    seeding continues. Errors from embedding or storing the documents
    propagate to the caller.
    """
    import sys  # local import: only needed for the warning below

    PERSIST_DIR.mkdir(parents=True, exist_ok=True)
    persist_dir = str(PERSIST_DIR)
    embeddings = OllamaEmbeddings(model="nomic-embed-text")

    try:
        existing = Chroma(
            collection_name=COLLECTION_NAME,
            embedding_function=embeddings,
            persist_directory=persist_dir,
        )
        existing.delete_collection()
    except Exception as exc:
        # Dropping the old collection stays best-effort, but say so instead of
        # hiding the failure: a silent pass here can mask a broken store.
        print(
            f"Warning: could not delete existing collection "
            f"'{COLLECTION_NAME}': {exc}",
            file=sys.stderr,
        )

    Chroma.from_documents(
        documents=SAMPLE_POLICIES,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        persist_directory=persist_dir,
    )
    print(f"Seeded {len(SAMPLE_POLICIES)} documents into ChromaDB collection '{COLLECTION_NAME}'.")


# Run the seeding only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
| |
|