Advanced Usage Examples
This page presents advanced examples of using the ParseMyFile API for complex use cases.
Example 1: Table Data Extraction
YAML Configuration for Table
yaml
# billing-table.yaml
schemas:
  data:
    type: object
    properties:
      table_header:
        type: string
        description: table header
      product_rows:
        type: array<object>
        description: product rows in the table
        items:
          type: object
          properties:
            product:
              type: string
              description: product name
            quantity:
              type: integer
              description: product quantity
            unit_price:
              type: double
              description: unit price in euros
            total:
              type: double
              description: line total in euros
      grand_total:
        type: double
        description: invoice grand total in euros

Python Code for Table Processing
python
import requests
import pandas as pd
from typing import List, Dict, Any
def process_table_document(
    file_path: str,
    yaml_path: str,
    api_key: str,
    timeout: float = 60.0,
) -> pd.DataFrame:
    """Upload a document to the ParseMyFile API and return its table rows.

    Args:
        file_path: Path of the document to upload (sent as ``application/pdf``).
        yaml_path: Path of the YAML extraction configuration.
        api_key: Value sent in the ``X-API-KEY`` header.
        timeout: Seconds to wait for the API before ``requests`` gives up.
            Without it a stalled connection would block the caller forever.

    Returns:
        A DataFrame built from the ``product_rows`` extracted field, or an
        empty DataFrame when that field is missing or empty.

    Raises:
        Exception: If the API answers with a status code other than 200.
    """
    url = "https://api.parsemyfile.com/api/v1/generate"
    headers = {"X-API-KEY": api_key}

    # Both files must stay open while the multipart body is being sent.
    with open(file_path, 'rb') as file, open(yaml_path, 'rb') as yaml_file:
        files = {
            'file': (file_path, file, 'application/pdf'),
            'yaml_file': ('config.yaml', yaml_file, 'text/yaml'),
        }
        response = requests.post(
            url, headers=headers, files=files, timeout=timeout
        )

    if response.status_code != 200:
        raise Exception(f"API Error: {response.text}")

    data = response.json()['data']['extracted_fields']

    # Convert table rows to a DataFrame; no rows means an empty frame.
    rows = data.get('product_rows')
    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows)
# Usage
try:
df = process_table_document('invoice_with_table.pdf', 'billing-table.yaml', 'your_api_key')
print("Extracted table:")
print(df)
# Calculate statistics
print(f"\nNumber of rows: {len(df)}")
print(f"Grand total: {df['total'].sum():.2f} EUR")
except Exception as e:
print(f"Error: {e}")

Example 2: Multi-page Document Extraction
YAML Configuration for Multi-page Document
yaml
# multipage-document.yaml
schemas:
  data:
    type: object
    properties:
      document_title:
        type: string
        description: document title
      executive_summary:
        type: string
        description: document executive summary
      chapters:
        type: array<object>
        description: document chapters
        items:
          type: object
          properties:
            chapter_title:
              type: string
              description: chapter title
            chapter_content:
              type: string
              description: chapter content
      conclusion:
        type: string
        description: document conclusion
      signatures:
        type: array<string>
        description: detected signatures

JavaScript Code for Multi-page Processing
javascript
/**
 * Sends a document plus its YAML configuration to the ParseMyFile
 * generate endpoint and reshapes the response into a
 * `{ metadata, content }` structure.
 */
class MultiPageProcessor {
  /**
   * @param {string} apiKey Value sent in the X-API-KEY header.
   */
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.baseUrl = 'https://api.parsemyfile.com/api/v1/generate';
  }

  /**
   * Upload both files and return the organized extraction result.
   * Any failure is logged and then rethrown to the caller.
   *
   * @param {File|Blob} file Document to process.
   * @param {File|Blob} yamlFile YAML extraction configuration.
   * @returns {Promise<object>} Result of organizeMultiPageData().
   */
  async processMultiPageDocument(file, yamlFile) {
    try {
      const payload = new FormData();
      payload.append('file', file);
      payload.append('yaml_file', yamlFile);

      const requestOptions = {
        method: 'POST',
        headers: { 'X-API-KEY': this.apiKey },
        body: payload
      };
      const response = await fetch(this.baseUrl, requestOptions);

      if (!response.ok) {
        throw new Error(`Error ${response.status}: ${response.statusText}`);
      }

      const body = await response.json();
      return this.organizeMultiPageData(body.data);
    } catch (error) {
      console.error('Error during multi-page processing:', error);
      throw error;
    }
  }

  /**
   * Map the API's `data` object onto the structure used by the UI.
   *
   * @param {object} data Object carrying `metadata` and `extracted_fields`.
   * @returns {{metadata: *, content: object}}
   */
  organizeMultiPageData(data) {
    const metadata = data.metadata;
    const fields = data.extracted_fields;

    return {
      metadata,
      content: {
        title: fields.document_title,
        executiveSummary: fields.executive_summary,
        chapters: this.organizeChapters(fields.chapters),
        conclusion: fields.conclusion,
        signatures: fields.signatures
      }
    };
  }

  /**
   * Number the chapters from 1 and rename their keys.
   * Anything that is not an array yields an empty list.
   *
   * @param {Array<object>} chaptersData Raw chapter entries.
   * @returns {Array<{id: number, title: *, content: *}>}
   */
  organizeChapters(chaptersData) {
    // Falsy values are never arrays, so one check covers both cases.
    if (!Array.isArray(chaptersData)) {
      return [];
    }

    return chaptersData.map((entry, position) => {
      return {
        id: position + 1,
        title: entry.chapter_title,
        content: entry.chapter_content
      };
    });
  }
}
// Usage
const processor = new MultiPageProcessor('your_api_key');

// On click: require a file in both inputs, run the processor, then render
// the organized result or report the failure to the user.
document.getElementById('processBtn').addEventListener('click', async () => {
  const documentPicker = document.getElementById('fileInput');
  const configPicker = document.getElementById('yamlInput');

  const selectionMissing =
    documentPicker.files.length === 0 || configPicker.files.length === 0;
  if (selectionMissing) {
    alert('Please select a file and YAML configuration');
    return;
  }

  try {
    const organized = await processor.processMultiPageDocument(
      documentPicker.files[0],
      configPicker.files[0]
    );
    console.log('Multi-page document processed:', organized);
    displayResults(organized);
  } catch (err) {
    console.error('Error:', err);
    alert('Error during processing: ' + err.message);
  }
});
function displayResults(result) {
const output = document.getElementById('output');
output.innerHTML = `
<h3>Title: ${result.content.title}</h3>
<h4>Executive Summary:</h4>
<p>${result.content.executiveSummary}</p>
<h4>Chapters (${result.content.chapters.length}):</h4>
${result.content.chapters.map(ch => `
<div>
<strong>Chapter ${ch.id}: ${ch.title}</strong>
<p>${ch.content}</p>
</div>
`).join('')}
<h4>Conclusion:</h4>
<p>${result.content.conclusion}</p>
`;
}

Example 3: Extraction with Advanced Validation
YAML Configuration with Validation
yaml
# advanced-validation.yaml
schemas:
  data:
    type: object
    properties:
      order_number:
        type: string
        description: order number (format CMD-XXXXXX)
      client_email:
        type: string
        description: client email
      order_amount:
        type: double
        description: order amount in euros
      delivery_date:
        type: string
        description: expected delivery date (format YYYY-MM-DD)
      products:
        type: array<object>
        description: ordered products list
        items:
          type: object
          properties:
            product_name:
              type: string
              description: product name
            quantity:
              type: integer
              description: product quantity
            price:
              type: double
              description: unit price in euros

Python Code with Validation
python
import requests
import re
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
@dataclass
class ValidationError:
    """One failed validation check, as collected by the validator."""

    # Name or path of the checked field, e.g. ``products[0].quantity``.
    field: str
    # Human-readable reason the check failed.
    message: str
    # Value reported alongside the failure (not necessarily a string).
    value: Any
class AdvancedValidator:
    """Validate the fields of an extracted order and collect the failures.

    Every ``validate_*`` method returns ``True`` when the value is
    acceptable. Otherwise it appends a ``ValidationError`` to
    ``self.errors`` and returns ``False``. Values of an unexpected type
    (for example ``None``) are reported as validation failures rather than
    raising.
    """

    # The extraction schema asks for YYYY-MM-DD. DD/MM/YYYY is kept as a
    # fallback so dates in the previously accepted format still validate.
    _DATE_FORMATS = ("%Y-%m-%d", "%d/%m/%Y")

    def __init__(self):
        self.errors: List[ValidationError] = []

    def _fail(self, field: str, message: str, value: Any) -> bool:
        """Record one validation error and return ``False``."""
        self.errors.append(ValidationError(field, message, value))
        return False

    def validate_order_number(self, value: str) -> bool:
        """Check that *value* is ``CMD-`` followed by exactly six digits."""
        # fullmatch rejects a trailing newline, which ``^...$`` would accept.
        if not isinstance(value, str) or not re.fullmatch(r"CMD-[0-9]{6}", value):
            return self._fail(
                "order_number",
                "Order number must be in format CMD-XXXXXX",
                value,
            )
        return True

    def validate_email(self, value: str) -> bool:
        """Check that *value* looks like ``local@domain.tld``."""
        pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
        if not isinstance(value, str) or not re.fullmatch(pattern, value):
            return self._fail("client_email", "Invalid email format", value)
        return True

    def validate_delivery_date(self, value: str) -> bool:
        """Check that *value* is a date from today up to 365 days ahead."""
        delivery = None
        if isinstance(value, str):
            for fmt in self._DATE_FORMATS:
                try:
                    delivery = datetime.strptime(value, fmt).date()
                    break
                except ValueError:
                    continue
        if delivery is None:
            return self._fail(
                "delivery_date",
                "Invalid date format (YYYY-MM-DD expected)",
                value,
            )

        # Compare calendar dates: comparing against the current timestamp
        # would reject a delivery scheduled for today.
        today = datetime.now().date()
        if delivery < today:
            return self._fail(
                "delivery_date",
                "Delivery date cannot be in the past",
                value,
            )
        if delivery > today + timedelta(days=365):
            return self._fail(
                "delivery_date",
                "Delivery date cannot exceed one year",
                value,
            )
        return True

    def validate_amount(self, value: float) -> bool:
        """Check that *value* is a number between 0 and 10000 inclusive."""
        # The chained comparison is False for NaN, so NaN is rejected too.
        if not isinstance(value, (int, float)) or not 0 <= value <= 10000:
            return self._fail(
                "order_amount",
                "Amount must be between 0 and 10000 EUR",
                value,
            )
        return True

    def validate_products(self, products: List[Dict]) -> bool:
        """Check the product list; stops at the first invalid entry.

        The list must hold 1 to 10 products, each with a non-empty
        ``product_name`` and a ``quantity`` between 1 and 100.
        """
        if not isinstance(products, (list, tuple)):
            return self._fail(
                "products",
                "Number of products must be between 1 and 10",
                products,
            )
        if not 1 <= len(products) <= 10:
            return self._fail(
                "products",
                "Number of products must be between 1 and 10",
                len(products),
            )

        for i, product in enumerate(products):
            if not isinstance(product, dict) or not product.get('product_name'):
                return self._fail(
                    f"products[{i}].product_name",
                    "Product name is required",
                    product,
                )
            quantity = product.get('quantity')
            # A missing or non-numeric quantity is invalid, not a crash.
            if not isinstance(quantity, (int, float)) or not 1 <= quantity <= 100:
                return self._fail(
                    f"products[{i}].quantity",
                    "Quantity must be between 1 and 100",
                    quantity,
                )
        return True
def process_with_validation(
    file_path: str,
    yaml_path: str,
    api_key: str,
    timeout: float = 60.0,
) -> Dict[str, Any]:
    """Extract an order through the ParseMyFile API and validate its fields.

    Args:
        file_path: Path of the document to upload (sent as ``application/pdf``).
        yaml_path: Path of the YAML extraction configuration.
        api_key: Value sent in the ``X-API-KEY`` header.
        timeout: Seconds to wait for the API before ``requests`` gives up.

    Returns:
        A dict with ``data`` (the extracted fields), ``valid`` (``True`` when
        no validation error was recorded), ``errors`` (the recorded
        ``ValidationError`` objects) and ``metadata`` (``None`` if the API
        response carries none).

    Raises:
        Exception: If the API answers with a status code other than 200.
    """
    # API call
    url = "https://api.parsemyfile.com/api/v1/generate"
    headers = {"X-API-KEY": api_key}
    with open(file_path, 'rb') as file, open(yaml_path, 'rb') as yaml_file:
        files = {
            'file': (file_path, file, 'application/pdf'),
            'yaml_file': ('config.yaml', yaml_file, 'text/yaml'),
        }
        response = requests.post(
            url, headers=headers, files=files, timeout=timeout
        )

    if response.status_code != 200:
        raise Exception(f"API Error: {response.text}")

    result = response.json()
    data = result['data']['extracted_fields']

    # Validation
    validator = AdvancedValidator()

    # ``or`` fallbacks also cover fields present with a null value, which a
    # plain ``.get(key, default)`` would pass through as None.
    validator.validate_order_number(data.get('order_number') or '')
    validator.validate_email(data.get('client_email') or '')
    validator.validate_delivery_date(data.get('delivery_date') or '')

    # A non-numeric amount is reported as a validation error, not a crash.
    raw_amount = data.get('order_amount', 0)
    try:
        amount = float(raw_amount)
    except (TypeError, ValueError):
        validator.errors.append(ValidationError(
            "order_amount",
            "Amount must be a number",
            raw_amount,
        ))
    else:
        validator.validate_amount(amount)

    validator.validate_products(data.get('products') or [])

    return {
        'data': data,
        'valid': not validator.errors,
        'errors': validator.errors,
        'metadata': result['data'].get('metadata'),
    }
# Usage
try:
result = process_with_validation('order.pdf', 'advanced-validation.yaml', 'your_api_key')
if result['valid']:
print("✅ Document processed and validated successfully")
print(f"Order number: {result['data']['order_number']}")
print(f"Client: {result['data']['client_email']}")
print(f"Amount: {result['data']['order_amount']} EUR")
else:
print("❌ Validation errors detected:")
for error in result['errors']:
print(f" - {error.field}: {error.message} (value: {error.value})")
except Exception as e:
print(f"Error: {e}")

Example 4: Database Integration
Python Code with Database Integration
python
import json
import sqlite3
from datetime import datetime
from typing import Any, Dict, List

import requests
class DocumentProcessor:
    """Process documents through the ParseMyFile API and log them in SQLite.

    Each processed document gets one row in ``documents``; every extracted
    field of a successful run gets one row in ``extracted_fields``.
    """

    # Keys of the dicts returned by search_documents, in SELECT order.
    _SEARCH_COLUMNS = (
        'id',
        'filename',
        'document_type',
        'processed_at',
        'status',
        'confidence_score',
        'field_value',
    )

    def __init__(self, api_key: str, db_path: str = "documents.db",
                 timeout: float = 60.0):
        """Store the settings and create the tables if they are missing.

        Args:
            api_key: Value sent in the ``X-API-KEY`` header.
            db_path: Path of the SQLite database file.
            timeout: Seconds to wait for the API before ``requests`` gives up.
        """
        self.api_key = api_key
        self.db_path = db_path
        self.timeout = timeout
        self.init_database()

    def init_database(self):
        """Initialize the database"""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT NOT NULL,
                    document_type TEXT NOT NULL,
                    processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    status TEXT NOT NULL,
                    confidence_score REAL,
                    extracted_data TEXT,
                    error_message TEXT
                )
            ''')
            conn.execute('''
                CREATE TABLE IF NOT EXISTS extracted_fields (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    document_id INTEGER,
                    field_name TEXT NOT NULL,
                    field_value TEXT,
                    field_type TEXT,
                    confidence REAL,
                    FOREIGN KEY (document_id) REFERENCES documents (id)
                )
            ''')
            conn.commit()
        finally:
            # Close even when a statement fails so the handle is not leaked.
            conn.close()

    def _record_failure(self, conn, file_path: str, document_type: str,
                        message: str) -> None:
        """Insert and commit an ``error`` row for a document."""
        conn.execute('''
            INSERT INTO documents (filename, document_type, status, error_message)
            VALUES (?, ?, ?, ?)
        ''', (file_path, document_type, 'error', message))
        conn.commit()

    def process_document(self, file_path: str, yaml_path: str,
                         document_type: str) -> Dict[str, Any]:
        """Process a document and save to database

        Returns:
            ``{'success': True, 'document_id': ..., 'data': ...}`` when the
            API reports success, otherwise ``{'success': False,
            'error': ...}``. Failures, including exceptions raised while
            calling the API or storing the result, are recorded as an
            ``error`` row.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            # API call
            result = self.call_api(file_path, yaml_path)

            if result.get('status') != 'success':
                error = result.get('error', 'Unknown error')
                self._record_failure(conn, file_path, document_type, str(error))
                return {'success': False, 'error': error}

            payload = result['data']
            fields = payload['extracted_fields']
            confidence = payload['metadata']['confidence_score']

            # Save document
            cursor = conn.cursor()
            cursor.execute('''
                INSERT INTO documents (filename, document_type, status, confidence_score, extracted_data)
                VALUES (?, ?, ?, ?, ?)
            ''', (
                file_path,
                document_type,
                'success',
                confidence,
                json.dumps(fields),
            ))
            document_id = cursor.lastrowid

            # Save extracted fields; each row reuses the document-level
            # confidence and the default type 'text'.
            cursor.executemany('''
                INSERT INTO extracted_fields (document_id, field_name, field_value, field_type, confidence)
                VALUES (?, ?, ?, ?, ?)
            ''', [
                (document_id, name, str(value), 'text', confidence)
                for name, value in fields.items()
            ])
            conn.commit()
            return {
                'success': True,
                'document_id': document_id,
                'data': payload,
            }
        except Exception as e:
            # Deliberately broad: any failure is logged as an error row.
            # Roll back first so a half-written success row is not committed
            # together with the error row.
            conn.rollback()
            self._record_failure(conn, file_path, document_type, str(e))
            return {'success': False, 'error': str(e)}
        finally:
            conn.close()

    def call_api(self, file_path: str, yaml_path: str) -> Dict[str, Any]:
        """Call to ParseMyFile API

        Returns the decoded JSON body on HTTP 200, otherwise a dict with
        ``status`` set to ``'error'`` and an ``error`` message.
        """
        url = "https://api.parsemyfile.com/api/v1/generate"
        headers = {"X-API-KEY": self.api_key}
        with open(file_path, 'rb') as file, open(yaml_path, 'rb') as yaml_file:
            files = {
                'file': (file_path, file, 'application/pdf'),
                'yaml_file': ('config.yaml', yaml_file, 'text/yaml'),
            }
            response = requests.post(
                url, headers=headers, files=files, timeout=self.timeout
            )

        if response.status_code == 200:
            return response.json()
        return {
            'status': 'error',
            'error': f"API Error: {response.status_code} - {response.text}",
        }

    def get_document_stats(self) -> Dict[str, Any]:
        """Get processed documents statistics"""
        conn = sqlite3.connect(self.db_path)
        try:
            # General statistics
            total, successful, errors, avg_confidence = conn.execute('''
                SELECT
                    COUNT(*) as total,
                    SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as success,
                    SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as errors,
                    AVG(confidence_score) as avg_confidence
                FROM documents
            ''').fetchone()

            # Statistics by document type
            by_type = dict(conn.execute('''
                SELECT document_type, COUNT(*) as count
                FROM documents
                GROUP BY document_type
            ''').fetchall())
        finally:
            conn.close()

        return {
            'total_documents': total,
            # SUM over an empty table yields NULL; report 0 instead of None.
            'successful': successful or 0,
            'errors': errors or 0,
            'average_confidence': avg_confidence,
            'by_type': by_type,
        }

    def search_documents(self, field_name: str, field_value: str) -> List[Dict[str, Any]]:
        """Search documents by field value

        Returns one dict per matching extracted field, holding the document
        columns plus the matched ``field_value`` (substring match via LIKE).
        """
        conn = sqlite3.connect(self.db_path)
        try:
            # Columns are listed explicitly so positions cannot drift: with
            # ``d.*`` the matched value lands after all eight document
            # columns, and index 7 is ``error_message``.
            rows = conn.execute('''
                SELECT d.id, d.filename, d.document_type, d.processed_at,
                       d.status, d.confidence_score, ef.field_value
                FROM documents d
                JOIN extracted_fields ef ON d.id = ef.document_id
                WHERE ef.field_name = ? AND ef.field_value LIKE ?
            ''', (field_name, f"%{field_value}%")).fetchall()
        finally:
            conn.close()

        return [dict(zip(self._SEARCH_COLUMNS, row)) for row in rows]
# Usage
processor = DocumentProcessor('your_api_key')
# Process a document
result = processor.process_document('invoice.pdf', 'invoice.yaml', 'invoice')
if result['success']:
print(f"✅ Document processed successfully (ID: {result['document_id']})")
else:
print(f"❌ Error: {result['error']}")
# Statistics
stats = processor.get_document_stats()
print(f"Statistics: {stats['successful']}/{stats['total_documents']} documents processed successfully")
# Search
results = processor.search_documents('client_email', 'john@example.com')
print(f"Documents found: {len(results)}")

Example 5: Custom REST API
Flask Code for REST API
python
from flask import Flask, request, jsonify
import requests
import os
from werkzeug.utils import secure_filename
import json
# Flask application object; the route decorators below register on it.
app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 10 * 1024 * 1024 # 10 MB max
# Configuration
# The API key comes from the environment; os.getenv returns None when the
# variable is not set.
PARSE_MY_FILE_API_KEY = os.getenv('PARSE_MY_FILE_API_KEY')
# Base URL that the endpoint paths are appended to.
PARSE_MY_FILE_BASE_URL = 'https://api.parsemyfile.com'
@app.route('/api/process', methods=['POST'])
def process_document():
    """Endpoint to process a document

    Expects multipart fields ``file`` and ``yaml_file`` and forwards them to
    the ParseMyFile generate endpoint.

    Returns:
        200 with ``success``, ``data`` and ``processing_info`` on success;
        400 when a file is missing or unnamed; the upstream status code when
        the API rejects the request; 500 when the API key is not configured
        or an unexpected error occurs.
    """
    # Request validation stays outside the broad ``except`` below so that
    # HTTP errors raised while reading the upload (such as the 413 produced
    # by MAX_CONTENT_LENGTH) reach Flask instead of becoming a 500.
    uploads = request.files
    if 'file' not in uploads or 'yaml_file' not in uploads:
        return jsonify({'error': 'Missing files'}), 400

    file = uploads['file']
    yaml_file = uploads['yaml_file']
    if file.filename == '' or yaml_file.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    # os.getenv yields None when the variable is unset; fail with a clear
    # message rather than sending an invalid header.
    if not PARSE_MY_FILE_API_KEY:
        return jsonify({
            'success': False,
            'error': 'PARSE_MY_FILE_API_KEY is not configured'
        }), 500

    try:
        # Call to ParseMyFile API
        files = {
            'file': (secure_filename(file.filename), file, file.content_type),
            'yaml_file': (secure_filename(yaml_file.filename), yaml_file, 'text/yaml')
        }
        headers = {'X-API-KEY': PARSE_MY_FILE_API_KEY}
        response = requests.post(
            f'{PARSE_MY_FILE_BASE_URL}/api/v1/generate',
            headers=headers,
            files=files,
            # Bound the wait so a stalled upstream cannot tie up a worker.
            timeout=60
        )

        if response.status_code != 200:
            return jsonify({
                'success': False,
                'error': f"API Error: {response.status_code}",
                'details': response.text
            }), response.status_code

        result = response.json()
        return jsonify({
            'success': True,
            'data': result['data'],
            'processing_info': result.get('processing_info', {})
        })
    except Exception as e:
        # Top-level boundary: report any upstream or decoding failure as JSON.
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
@app.route('/api/health', methods=['GET'])
def health_check():
    """API health check

    Returns 200 with the upstream ``/health`` payload when ParseMyFile
    answers 200; otherwise 503 with ``status`` set to ``unhealthy``.
    """
    try:
        # A health probe must answer quickly, so bound the upstream wait.
        response = requests.get(f'{PARSE_MY_FILE_BASE_URL}/health', timeout=5)
        if response.status_code != 200:
            return jsonify({
                'status': 'unhealthy',
                'parse_my_file_api': 'unavailable'
            }), 503
        return jsonify({
            'status': 'healthy',
            'parse_my_file_api': response.json()
        })
    except Exception as e:
        # Network failures and non-JSON bodies both mean "unhealthy".
        return jsonify({
            'status': 'unhealthy',
            'error': str(e)
        }), 503
@app.route('/api/templates', methods=['GET'])
def get_templates():
    """Get available configuration templates

    Returns a JSON object keyed by template id; each entry carries ``name``,
    ``description`` and the template text under ``yaml``.
    """
    # The YAML nesting is written out explicitly: each ``- name`` entry is
    # one list item and the keys after it belong to that item.
    # NOTE(review): these templates use a ``fields:`` layout, while the YAML
    # configurations elsewhere on this page use ``schemas:``. Confirm which
    # layout the API accepts.
    invoice_yaml = (
        '\n'
        'fields:\n'
        '  - name: "invoice_number"\n'
        '    type: "text"\n'
        '    position: "top-right"\n'
        '    required: true\n'
        '  - name: "total_amount"\n'
        '    type: "currency"\n'
        '    position: "bottom-right"\n'
        '    currency: "EUR"\n'
    )
    contract_yaml = (
        '\n'
        'fields:\n'
        '  - name: "contract_title"\n'
        '    type: "text"\n'
        '    position: "top-center"\n'
        '  - name: "contracting_parties"\n'
        '    type: "text"\n'
        '    position: "center"\n'
        '    multiline: true\n'
    )
    templates = {
        'invoice': {
            'name': 'Invoice',
            'description': 'Configuration for processing invoices',
            'yaml': invoice_yaml
        },
        'contract': {
            'name': 'Contract',
            'description': 'Configuration for processing contracts',
            'yaml': contract_yaml
        }
    }
    return jsonify(templates)
if __name__ == '__main__':
app.run(debug=True, host='0.0.0.0', port=5000)

HTML Code for User Interface
html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ParseMyFile - Document Processing Interface</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; }
.container { max-width: 800px; margin: 0 auto; }
.form-group { margin-bottom: 20px; }
label { display: block; margin-bottom: 5px; font-weight: bold; }
input[type="file"] { width: 100%; padding: 10px; }
button { background: #007bff; color: white; padding: 10px 20px; border: none; cursor: pointer; }
button:hover { background: #0056b3; }
.result { margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 5px; }
.error { background: #f8d7da; color: #721c24; }
.success { background: #d4edda; color: #155724; }
</style>
</head>
<body>
<div class="container">
<h1>ParseMyFile - Document Processing</h1>
<form id="processForm" enctype="multipart/form-data">
<div class="form-group">
<label for="file">Document to process:</label>
<input type="file" id="file" name="file" accept=".pdf,.jpg,.jpeg,.png,.tiff,.bmp" required>
</div>
<div class="form-group">
<label for="yaml_file">YAML Configuration:</label>
<input type="file" id="yaml_file" name="yaml_file" accept=".yaml,.yml" required>
</div>
<button type="submit">Process Document</button>
</form>
<div id="result" class="result" style="display: none;"></div>
</div>
<script>
// Submit handler: upload both files to /api/process and show the outcome.
document.getElementById('processForm').addEventListener('submit', async (e) => {
  e.preventDefault();

  // showResult assigns its message to innerHTML, so every value that comes
  // from the API or from an exception is escaped before interpolation.
  const escapeHtml = (text) => String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');

  const fileInput = document.getElementById('file');
  const yamlInput = document.getElementById('yaml_file');
  if (fileInput.files.length === 0 || yamlInput.files.length === 0) {
    showResult('Please select both files', 'error');
    return;
  }

  const formData = new FormData();
  formData.append('file', fileInput.files[0]);
  formData.append('yaml_file', yamlInput.files[0]);

  try {
    showResult('Processing...', 'info');
    const response = await fetch('/api/process', {
      method: 'POST',
      body: formData
    });
    const result = await response.json();

    if (result.success) {
      const fields = escapeHtml(JSON.stringify(result.data.extracted_fields, null, 2));
      const metadata = escapeHtml(JSON.stringify(result.data.metadata, null, 2));
      showResult(`
        <h3>✅ Document processed successfully</h3>
        <h4>Extracted data:</h4>
        <pre>${fields}</pre>
        <h4>Metadata:</h4>
        <pre>${metadata}</pre>
      `, 'success');
    } else {
      showResult(`❌ Error: ${escapeHtml(result.error)}`, 'error');
    }
  } catch (error) {
    showResult(`❌ Error: ${escapeHtml(error.message)}`, 'error');
  }
});
/**
 * Show a message in the #result panel.
 *
 * @param {string} message HTML markup assigned to the panel's innerHTML.
 * @param {string} type Class name appended after "result".
 */
function showResult(message, type) {
  const panel = document.getElementById('result');
  panel.innerHTML = message;
  panel.className = ['result', type].join(' ');
  // The panel starts hidden (display: none), so make it visible.
  panel.style.display = 'block';
}
</script>
</body>
</html>

These advanced examples show how to use the ParseMyFile API in complex and professional contexts, with validation, database integration, and custom REST API creation.