Documentation Index
Fetch the complete documentation index at: https://docs.clarifeye.ai/llms.txt
Use this file to discover all available pages before exploring further.
Object extractors use LLMs to extract structured entities from documents based on Pydantic models.
# List all object extractors
extractors = warehouse.list_object_extractors()
for extractor in extractors:
    # Each entry is read by its 'name' and 'id' keys.
    print(f"Extractor: {extractor['name']} (ID: {extractor['id']})")
from pydantic import BaseModel
from typing import Optional
# Define your Pydantic model
class Invoice(BaseModel):
    """An invoice extracted from a document"""

    # Fields declared without a default value.
    invoice_number: str
    date: str
    vendor: str
    total_amount: float
    currency: str
    line_items: list[dict]
    # Optional field: falls back to None when no value is provided.
    notes: Optional[str] = None
# Create an object extractor built around the Invoice model
extractor = warehouse.create_object_extractor(
    name="Invoice Extractor",
    brief="Extract invoice data from documents",
    extractable_object=Invoice,  # Your Pydantic model
    extraction_prompt="Extract all invoice information from the document...",
    llm_model="gpt-4o",
    compute_alerts=True,
    block_grouping_config={"type": "page_window", "window_size_pages": 5},
    add_anchoring_object=True,
)
# The returned record is indexed by 'id'; later calls reuse that value.
print(f"Created extractor: {extractor['id']}")
# Update an existing object extractor, identified by its id
updated = warehouse.update_object_extractor(
    object_extractor_id=extractor['id'],
    name="Updated Invoice Extractor",
    extraction_prompt="Updated extraction prompt...",
    llm_model="claude-3-5-sonnet-20241022",
    compute_alerts=False,
)
# Run extraction on all documents (no document_ids filter is passed)
task = warehouse.run_object_extractor(
    object_extractor_id=extractor['id'],
)
# Wait on the returned task and keep whatever it yields.
result = task.wait_for_completion()
# Run on specific documents by passing their ids
task = warehouse.run_object_extractor(
    object_extractor_id=extractor['id'],
    document_ids=["doc-id-1", "doc-id-2"],
)
# Delete an object extractor, identified by its id
warehouse.delete_object_extractor(
    object_extractor_id=extractor['id'],
)