Skip to main content

Documentation Index

Fetch the complete documentation index at: https://docs.clarifeye.ai/llms.txt

Use this file to discover all available pages before exploring further.

Object extractors use LLMs to extract structured entities from documents. The shape of each extracted entity is defined by a Pydantic model that you supply.

List Object Extractors

# List all object extractors
# NOTE: `warehouse` is not defined in this snippet; it assumes an existing client object.
extractors = warehouse.list_object_extractors()
# Each entry is indexed by key below and exposes at least 'name' and 'id'.
for extractor in extractors:
    print(f"Extractor: {extractor['name']} (ID: {extractor['id']})")

Create Object Extractor

from pydantic import BaseModel
from typing import Optional

# Define your Pydantic model
# Fields declared without a default are required by Pydantic; only `notes` is optional here.
class Invoice(BaseModel):
    """An invoice extracted from a document"""
    invoice_number: str
    date: str  # kept as a plain string; this model enforces no date format
    vendor: str
    total_amount: float
    currency: str  # free-form string; no set of allowed values is declared
    line_items: list[dict]  # each item is an untyped dict; its keys are not constrained by this model
    notes: Optional[str] = None  # may be omitted; defaults to None

# Create an object extractor
# NOTE: `warehouse` is not defined in this snippet; it assumes an existing client object.
extractor = warehouse.create_object_extractor(
    name="Invoice Extractor",
    brief="Extract invoice data from documents",
    extractable_object=Invoice,  # Your Pydantic model: the class itself, not an instance
    extraction_prompt="Extract all invoice information from the document...",
    llm_model="gpt-4o",
    compute_alerts=True,
    # NOTE(review): presumably groups document content into 5-page windows
    # before extraction — confirm against the API reference.
    block_grouping_config={"type": "page_window", "window_size_pages": 5},
    add_anchoring_object=True
)

# The returned value is indexed by key; its 'id' is what the update, run and
# delete calls on this page pass as `object_extractor_id`.
print(f"Created extractor: {extractor['id']}")

Update Object Extractor

# Update an existing object extractor
# The target is selected by `object_extractor_id`; the remaining keyword
# arguments carry the new values.
# NOTE(review): fields not passed here (e.g. `brief`) presumably keep their
# current values — confirm against the API reference.
updated = warehouse.update_object_extractor(
    object_extractor_id=extractor['id'],
    name="Updated Invoice Extractor",
    extraction_prompt="Updated extraction prompt...",
    llm_model="claude-3-5-sonnet-20241022",
    compute_alerts=False
)

Run Object Extractor

# Run extraction on all documents
# No `document_ids` argument is passed in this call.
task = warehouse.run_object_extractor(
    object_extractor_id=extractor['id']
)
# The run call returns a task object; `wait_for_completion()` is called on it
# and its return value is kept in `result` (not used further in this snippet).
result = task.wait_for_completion()

# Run on specific documents
# "doc-id-1" / "doc-id-2" are placeholder IDs; substitute real document IDs.
# This second task is not waited on here; call `task.wait_for_completion()`
# as in the first example if the outcome is needed.
task = warehouse.run_object_extractor(
    object_extractor_id=extractor['id'],
    document_ids=["doc-id-1", "doc-id-2"]
)

Delete Object Extractor

# Delete an object extractor
# The extractor is identified by its 'id', passed as `object_extractor_id`.
# NOTE(review): whether deletion can be undone, or what happens to objects
# already extracted, is not shown here — confirm before running.
warehouse.delete_object_extractor(object_extractor_id=extractor['id'])