Reducto Ruby SDK
An open-source Ruby library for the Reducto REST API. Provides convenient access to document processing features with both synchronous and asynchronous support.
Installation
Add this line to your application's Gemfile:
gem 'reducto'
And then execute:
bundle install
Or install it yourself as:
gem install reducto
Quick Start
The library needs to be configured with your account's API key, which is available in your Reducto Dashboard.
Set REDUCTO_API_KEY in your environment:
export REDUCTO_API_KEY='your-api-key-here'
Basic Usage
require 'reducto'
# Initialize the client
client = Reducto.new(api_key: ENV['REDUCTO_API_KEY'])
# Parse a document
response = client.parse.run(input: 'https://example.com/document.pdf')
puts response.job_id
puts response.usage.pages
Usage Examples
Upload and Parse Local Files
require 'reducto'
client = Reducto.new
# Upload a local file
upload = client.upload(file: File.open('sample.pdf'))
# Parse the uploaded file
result = client.parse.run(input: upload.url)
puts result
Structured Data Extraction
Extract structured data from documents using JSON schemas:
# Define extraction schema
schema = {
  type: 'object',
  properties: {
    customer_name: {
      type: 'string',
      description: 'The full name of the customer'
    },
    accounts: {
      type: 'array',
      description: 'List of financial accounts',
      items: {
        type: 'object',
        properties: {
          account_type: { type: 'string' },
          account_number: { type: 'string' },
          ending_value: { type: 'number' }
        },
        required: ['account_type', 'account_number', 'ending_value']
      }
    }
  },
  required: ['customer_name', 'accounts']
}
# Extract data with citations
result = client.extract.run(
  input: upload.url,
  instructions: {
    schema: schema,
    system_prompt: 'Be precise and thorough.'
  },
  settings: {
    citations: {
      enabled: true
    }
  }
)
# Access extracted data
puts "Customer: #{result['result']['customer_name']['value']}"
result['result']['accounts']['value'].each do |account|
  puts " #{account['account_type']['value']}: $#{account['ending_value']['value']}"
end
Document Editing and Form Filling
# Upload a document
upload = client.upload(file: File.open('contract.docx'))
# Fill in form fields
result = client.edit.run(
  document_url: upload.url,
  edit_instructions: "Fill in the client name as 'Acme Corporation' and set the contract date to January 15, 2024",
  edit_options: {
    color: '#0066CC'
  }
)
# Download the edited document
edited_url = result['document_url']
Async Processing with Job Polling
For large documents, use async processing:
# Submit async parse job
job_response = client.parse.run_job(
  input: 'https://example.com/large-document.pdf',
  enhance: {
    summarize_figures: true
  }
)
job_id = job_response['job_id']
puts "Job ID: #{job_id}"
# Poll for completion
loop do
  job = client.job.get(job_id)
  status = job['status']
  break if status == 'Completed' || status == 'Failed'
  puts "Status: #{status}"
  sleep 2
end
# Get final result
final_job = client.job.get(job_id)
if final_job['status'] == 'Completed'
  result = final_job['result']
  puts "Processing complete!"
else
  puts "Job failed: #{final_job['error']}"
end
Multi-Step Workflows
Chain operations efficiently by reusing parsed results:
# Step 1: Parse the document
parse_response = client.parse.run(input: document_url)
job_id = parse_response.job_id
# Step 2: Classify document type (reuses parsed data)
classification = client.extract.run(
  input: "jobid://#{job_id}", # Reference the parsed job
  instructions: {
    schema: {
      type: 'object',
      properties: {
        document_type: {
          type: 'string',
          enum: ['W2', 'Passport', 'Invoice', 'Other']
        }
      }
    }
  }
)
document_type = classification['result']['document_type']['value']
# Step 3: Extract with type-specific schema
schema = case document_type
when 'W2'
  { type: 'object', properties: { total_wages: { type: 'number' }, ... } }
when 'Invoice'
  { type: 'object', properties: { total_amount: { type: 'number' }, ... } }
end
# Extract using the same parsed job (saves processing time and credits)
extract_response = client.extract.run(
  input: "jobid://#{job_id}",
  instructions: { schema: schema }
)
Webhooks for Async Notifications
Configure webhooks to receive notifications when jobs complete:
# Step 1: Configure webhook (one-time setup)
webhook_config = client.webhook.run
puts "Configure webhook at: #{webhook_config['url']}"
# Step 2: Submit jobs with webhook
result = client.parse.run_job(
  input: 'https://example.com/document.pdf',
  async: {
    webhook: {
      mode: 'svix', # Managed webhook delivery
      channels: [] # Or specify specific channels
    }
  }
)
# Direct webhook (simpler, no Svix required)
result = client.parse.run_job(
  input: 'https://example.com/document.pdf',
  async: {
    webhook: {
      mode: 'direct',
      url: 'https://your-app.com/webhooks/reducto'
    }
  }
)
Batch Processing with AsyncClient
Process multiple documents concurrently (requires Ruby >= 3.1):
require 'reducto/async_client'
require 'async'
client = Reducto::AsyncClient.new
# Process documents concurrently
documents = Dir.glob('docs/**/*.pdf')[0...100]
max_concurrency = 50
Async do
  semaphore = Async::Semaphore.new(max_concurrency)
  results = documents.map do |path|
    # The semaphore caps how many documents are processed at once
    semaphore.async do
      # Upload and parse
      upload = client.upload(file: File.open(path)).wait
      result = client.parse.run(input: upload.url).wait
      # Save result
      output_path = path.sub('.pdf', '.reducto.json')
      File.write(output_path, result.to_json)
      { path: path, chunks: result.result.chunks.length }
    end
  end
  completed = results.map(&:wait)
  puts "Processed #{completed.length} documents"
end
client.close
Configuration
Environment Selection
# Production (default)
client = Reducto.new(environment: 'production')
# EU
client = Reducto.new(environment: 'eu')
# Australia
client = Reducto.new(environment: 'au')
# Custom base URL
client = Reducto.new(base_url: 'https://custom.reducto.ai')
Timeout Configuration
# Default timeout is 1 hour (3600 seconds)
client = Reducto.new(timeout: 120) # 2 minutes
Retry Configuration
# Default max retries is 2
client = Reducto.new(max_retries: 5)
Advanced Features
Raw Response Access
Access raw HTTP responses including headers:
response = client.parse.run(
  input: 'https://example.com/document.pdf',
  raw_response: true
)
puts response.status
puts response.headers
parsed_data = response.parse # Get the parsed object
Streaming Responses
Stream response data for server-sent events:
stream = client.parse.run(
  input: 'https://example.com/document.pdf',
  streaming: true
)
stream.each do |event|
  puts "Event: #{event.data}"
end
stream.close
Using Types and Response Models
Response objects are typed using BaseModel (similar to Pydantic):
response = client.parse.run(input: 'document.pdf')
# Access typed fields
response.job_id # String
response.duration # Float
response.usage.pages # Integer
response.result.chunks # Array
# Serialize to JSON
response.to_json
# Convert to hash
response.to_hash
# Check which fields are set
response.model_fields_set # Returns Set of field names
# Access extra/undocumented fields
response.model_extra # Returns Hash of extra fields
Distinguishing nil vs missing fields
if response.pdf_url.nil?
  if response.model_fields_set.include?(:pdf_url)
    puts 'pdf_url was explicitly null'
  else
    puts 'pdf_url was not present in the response'
  end
end
Error Handling
The library raises exceptions for API errors:
begin
  response = client.parse.run(input: 'invalid-url')
# Rescue specific errors before their parent classes
rescue Reducto::AuthenticationError => e
  puts "Authentication failed: #{e.message}"
rescue Reducto::RateLimitError => e
  puts "Rate limit exceeded: #{e.message}"
rescue Reducto::BadRequestError => e
  puts "Bad request: #{e.message}"
rescue Reducto::APITimeoutError => e
  puts "Request timed out: #{e.message}"
rescue Reducto::APIConnectionError => e
  puts "Connection error: #{e.message}"
rescue Reducto::APIStatusError => e
  puts "API error (#{e.status_code}): #{e.message}"
end
Exception Hierarchy
- Reducto::Error - Base error class
  - Reducto::APIConnectionError - Network/connection errors
    - Reducto::APITimeoutError - Timeout errors
  - Reducto::APIStatusError - HTTP status errors
    - Reducto::BadRequestError (400)
    - Reducto::AuthenticationError (401)
    - Reducto::PermissionDeniedError (403)
    - Reducto::NotFoundError (404)
    - Reducto::ConflictError (409)
    - Reducto::UnprocessableEntityError (422)
    - Reducto::RateLimitError (429)
    - Reducto::InternalServerError (5xx)
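Because the status errors share a common parent, broad handlers can sit below specific ones. A brief sketch that retries a few times on rate limits and falls back to the base classes:
begin
  attempts ||= 0
  response = client.parse.run(input: document_url)
rescue Reducto::RateLimitError
  # Back off and retry a limited number of times
  attempts += 1
  sleep(2 ** attempts)
  retry if attempts < 3
  raise
rescue Reducto::APIStatusError => e
  puts "API returned #{e.status_code}: #{e.message}"
rescue Reducto::Error => e
  puts "Reducto error: #{e.message}"
end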
API Resources
Parse
Parse documents into structured chunks:
# Synchronous parsing
response = client.parse.run(
  input: 'https://example.com/document.pdf',
  enhance: {
    summarize_figures: true
  },
  formatting: {
    markdown: true
  },
  retrieval: {
    embedding_optimized: true
  }
)
# Asynchronous parsing
job = client.parse.run_job(input: document_url, async: { webhook: {...} })
Extract
Extract structured data with schemas:
result = client.extract.run(
  input: document_url,
  instructions: {
    schema: {...},
    system_prompt: 'Be precise'
  },
  settings: {
    citations: { enabled: true }
  }
)
Edit
Fill forms and edit documents:
result = client.edit.run(
  document_url: document_url,
  edit_instructions: 'Fill in the form fields',
  edit_options: { color: '#0000FF' }
)
Split
Split documents into sections:
result = client.split.run(
  input: document_url,
  split_description: [
    { name: 'section1', description: 'First section' }
  ],
  split_rules: 'Split by major sections'
)
Pipeline
Run custom pipelines:
result = client.pipeline.run(
  input: document_url,
  pipeline_id: 'your-pipeline-id'
)
Job Management
# Get job status
job = client.job.get(job_id)
# List all jobs
jobs = client.job.get_all(limit: 10, cursor: 'next_page_token')
# Cancel a job
client.job.cancel(job_id)
Webhook
# Configure webhook
webhook = client.webhook.run
Utility Methods
# Upload a file
upload = client.upload(file: File.open('document.pdf'), extension: 'pdf')
# Get API version
version = client.api_version
Examples
See the examples/ directory for complete working examples:
- basic_usage.rb - Basic parsing and uploading
- structured_extraction.rb - JSON schema extraction
- document_editing.rb - Form filling and editing
- multi_step_workflow.rb - Chained operations
- job_polling.rb - Async job polling patterns
- webhook_setup.rb - Webhook configuration
- batch_processing.rb - Concurrent processing
- async_usage.rb - AsyncClient examples
- response_wrappers.rb - Raw and streaming responses
Requirements
- Ruby >= 2.7.0
- For async functionality (AsyncClient): Ruby >= 3.1.0 and the async and async-http gems (see the Gemfile sketch below)
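A minimal Gemfile for using AsyncClient might look like this:
# Gemfile
source 'https://rubygems.org'

gem 'reducto'
gem 'async'
gem 'async-http'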
Development
After checking out the repo, run bundle install to install dependencies. Then run bundle exec rspec to run the tests.
To install this gem onto your local machine, run bundle exec rake install.
Contributing
Bug reports and pull requests are welcome on GitHub at https://github.com/databodega-io/reducto-ruby-sdk.
License
The gem is available as open source under the terms of the Apache-2.0 License.
Acknowledgments
This Ruby SDK is inspired by the official Reducto Python SDK.