Skip to content

Advanced Usage Examples

This page presents advanced examples of using the ParseMyFile API for complex use cases.

Example 1: Table Data Extraction

YAML Configuration for Table

yaml
# billing-table.yaml
schemas:
  data:
    type: object
    properties:
      table_header:
        type: string
        description: table header
      product_rows:
        type: array<object>
        description: product rows in the table
        items:
          type: object
          properties:
            product:
              type: string
              description: product name
            quantity:
              type: integer
              description: product quantity
            unit_price:
              type: double
              description: unit price in euros
            total:
              type: double
              description: line total in euros
      grand_total:
        type: double
        description: invoice grand total in euros

Python Code for Table Processing

python
import requests
import pandas as pd
from typing import List, Dict, Any

def process_table_document(file_path: str, yaml_path: str, api_key: str,
                           timeout: float = 60.0) -> pd.DataFrame:
    """
    Process a document containing a table and return a DataFrame

    The document and its YAML schema are uploaded to the ParseMyFile
    ``/api/v1/generate`` endpoint and the ``product_rows`` field of the
    response is converted into a DataFrame.

    Args:
        file_path: Path of the document to upload.
        yaml_path: Path of the YAML schema describing the fields to extract.
        api_key: Value sent in the ``X-API-KEY`` header.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A DataFrame built from ``product_rows``; an empty DataFrame when the
        response contains no ``product_rows`` field.

    Raises:
        Exception: If the API answers with a status code other than 200.
    """
    import mimetypes
    import os

    url = "https://api.parsemyfile.com/api/v1/generate"
    headers = {"X-API-KEY": api_key}

    # Guess the MIME type from the extension so images are not announced as
    # PDFs; fall back to PDF when the type is unknown.
    content_type = mimetypes.guess_type(file_path)[0] or 'application/pdf'

    with open(file_path, 'rb') as file, open(yaml_path, 'rb') as yaml_file:
        files = {
            # Send only the base name: the local directory layout is of no
            # use to the server.
            'file': (os.path.basename(file_path), file, content_type),
            'yaml_file': ('config.yaml', yaml_file, 'text/yaml')
        }

        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(url, headers=headers, files=files, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"API Error: {response.text}")

    result = response.json()
    data = result['data']['extracted_fields']

    # Convert table rows to DataFrame
    if 'product_rows' in data:
        return pd.DataFrame(data['product_rows'])
    return pd.DataFrame()

# Usage
try:
    df = process_table_document('invoice_with_table.pdf', 'billing-table.yaml', 'your_api_key')
    print("Extracted table:")
    print(df)

    # Calculate statistics
    print(f"\nNumber of rows: {len(df)}")

    # An empty result has no 'total' column, so indexing it unconditionally
    # would raise KeyError and hide the (valid) empty table behind an error.
    if 'total' in df.columns:
        # Coerce to numbers so a missing or non-numeric cell does not break the sum.
        grand_total = pd.to_numeric(df['total'], errors='coerce').sum()
        print(f"Grand total: {grand_total:.2f} EUR")
    else:
        print("Grand total: n/a (no 'total' column extracted)")

except Exception as e:
    print(f"Error: {e}")

Example 2: Multi-page Document Extraction

YAML Configuration for Multi-page Document

yaml
# multipage-document.yaml
schemas:
  data:
    type: object
    properties:
      document_title:
        type: string
        description: document title
      executive_summary:
        type: string
        description: document executive summary
      chapters:
        type: array<object>
        description: document chapters
        items:
          type: object
          properties:
            chapter_title:
              type: string
              description: chapter title
            chapter_content:
              type: string
              description: chapter content
      conclusion:
        type: string
        description: document conclusion
      signatures:
        type: array<string>
        description: detected signatures

JavaScript Code for Multi-page Processing

javascript
/**
 * Client for the ParseMyFile generate endpoint that reshapes the extracted
 * fields of a multi-page document into a metadata/content structure.
 */
class MultiPageProcessor {
  /**
   * @param {string} apiKey - value sent in the X-API-KEY header
   */
  constructor(apiKey) {
    this.apiKey = apiKey;
    this.baseUrl = 'https://api.parsemyfile.com/api/v1/generate';
  }

  /**
   * Upload a document together with its YAML configuration and return the
   * organized result. Any failure is logged and then re-thrown.
   *
   * @param {File|Blob} file - document to process
   * @param {File|Blob} yamlFile - YAML configuration
   * @returns {Promise<object>} output of organizeMultiPageData
   */
  async processMultiPageDocument(file, yamlFile) {
    try {
      const payload = new FormData();
      payload.append('file', file);
      payload.append('yaml_file', yamlFile);

      const requestOptions = {
        method: 'POST',
        headers: { 'X-API-KEY': this.apiKey },
        body: payload
      };
      const response = await fetch(this.baseUrl, requestOptions);

      if (response.ok) {
        const parsed = await response.json();
        return this.organizeMultiPageData(parsed.data);
      }

      throw new Error(`Error ${response.status}: ${response.statusText}`);
    } catch (error) {
      console.error('Error during multi-page processing:', error);
      throw error;
    }
  }

  /**
   * Map the API payload onto { metadata, content }.
   *
   * @param {object} data - the `data` member of the API response
   * @returns {object} organized document
   */
  organizeMultiPageData(data) {
    const metadata = data.metadata;
    const fields = data.extracted_fields;

    return {
      metadata,
      content: {
        title: fields.document_title,
        executiveSummary: fields.executive_summary,
        chapters: this.organizeChapters(fields.chapters),
        conclusion: fields.conclusion,
        signatures: fields.signatures
      }
    };
  }

  /**
   * Number the chapters from 1 and rename their keys.
   *
   * @param {Array<object>} chaptersData - raw chapter entries
   * @returns {Array<object>} entries shaped as { id, title, content };
   *   an empty array when the input is not an array
   */
  organizeChapters(chaptersData) {
    // Array.isArray is false for every falsy value, so one check suffices.
    if (!Array.isArray(chaptersData)) {
      return [];
    }

    return chaptersData.map((entry, position) => {
      const { chapter_title: title, chapter_content: content } = entry;
      return { id: position + 1, title, content };
    });
  }
}

// Usage
const processor = new MultiPageProcessor('your_api_key');

// Wire the "process" button: validate the two file pickers, run the
// extraction, then render the result or report the failure.
document.getElementById('processBtn').addEventListener('click', async () => {
  const documentPicker = document.getElementById('fileInput');
  const schemaPicker = document.getElementById('yamlInput');

  const bothSelected =
    documentPicker.files.length !== 0 && schemaPicker.files.length !== 0;
  if (!bothSelected) {
    alert('Please select a file and YAML configuration');
    return;
  }

  const [documentFile] = [documentPicker.files[0]];
  const schemaFile = schemaPicker.files[0];

  try {
    const result = await processor.processMultiPageDocument(documentFile, schemaFile);

    console.log('Multi-page document processed:', result);
    displayResults(result);
  } catch (error) {
    console.error('Error:', error);
    alert('Error during processing: ' + error.message);
  }
});

/**
 * Render an organized multi-page result into the #output element.
 *
 * Every value interpolated below originates from the processed document, so
 * it is HTML-escaped first: assigning unescaped text to innerHTML would let
 * markup contained in a document run in the page.
 *
 * @param {object} result - object shaped as { content: { title,
 *   executiveSummary, chapters, conclusion } }
 */
function displayResults(result) {
  const HTML_ENTITIES = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;'
  };
  // null/undefined render as an empty string instead of the literal "undefined".
  const escapeHtml = (value) =>
    (value === null || value === undefined ? '' : String(value))
      .replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);

  const { title, executiveSummary, chapters, conclusion } = result.content;
  // Tolerate a missing chapter list rather than failing on .length / .map.
  const chapterList = Array.isArray(chapters) ? chapters : [];

  const output = document.getElementById('output');
  output.innerHTML = `
    <h3>Title: ${escapeHtml(title)}</h3>
    <h4>Executive Summary:</h4>
    <p>${escapeHtml(executiveSummary)}</p>
    <h4>Chapters (${chapterList.length}):</h4>
    ${chapterList.map(ch => `
      <div>
        <strong>Chapter ${escapeHtml(ch.id)}: ${escapeHtml(ch.title)}</strong>
        <p>${escapeHtml(ch.content)}</p>
      </div>
    `).join('')}
    <h4>Conclusion:</h4>
    <p>${escapeHtml(conclusion)}</p>
  `;
}

Example 3: Extraction with Advanced Validation

YAML Configuration with Validation

yaml
# advanced-validation.yaml
schemas:
  data:
    type: object
    properties:
      order_number:
        type: string
        description: order number (format CMD-XXXXXX)
      client_email:
        type: string
        description: client email
      order_amount:
        type: double
        description: order amount in euros
      delivery_date:
        type: string
        description: expected delivery date (format YYYY-MM-DD)
      products:
        type: array<object>
        description: ordered products list
        items:
          type: object
          properties:
            product_name:
              type: string
              description: product name
            quantity:
              type: integer
              description: product quantity
            price:
              type: double
              description: unit price in euros

Python Code with Validation

python
import requests
import re
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from dataclasses import dataclass

@dataclass
class ValidationError:
    """A single validation failure recorded by ``AdvancedValidator``."""
    field: str    # name (or path) of the field that failed
    message: str  # human-readable explanation
    value: Any    # offending value as received


class AdvancedValidator:
    """Validate extracted order fields and collect every failure in ``errors``.

    Each ``validate_*`` method returns ``True`` when the value is acceptable;
    otherwise it appends a ``ValidationError`` to ``self.errors`` and returns
    ``False``. Values of an unexpected type (for example ``None``) are
    reported as validation errors instead of raising.
    """

    # The extraction schema asks for YYYY-MM-DD; DD/MM/YYYY is still accepted
    # so values validated by earlier versions of this class keep passing.
    DATE_FORMATS = ("%Y-%m-%d", "%d/%m/%Y")

    def __init__(self):
        self.errors: List[ValidationError] = []

    def _fail(self, field: str, message: str, value: Any) -> bool:
        """Record a validation error and return ``False``."""
        self.errors.append(ValidationError(field, message, value))
        return False

    def validate_order_number(self, value: str) -> bool:
        """Check that *value* is ``CMD-`` followed by exactly six digits."""
        # fullmatch: ``$`` with re.match would also accept a trailing newline.
        if isinstance(value, str) and re.fullmatch(r"CMD-[0-9]{6}", value):
            return True
        return self._fail(
            "order_number",
            "Order number must be in format CMD-XXXXXX",
            value
        )

    def validate_email(self, value: str) -> bool:
        """Check that *value* looks like an e-mail address."""
        pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
        if isinstance(value, str) and re.fullmatch(pattern, value):
            return True
        return self._fail("client_email", "Invalid email format", value)

    def validate_delivery_date(self, value: str) -> bool:
        """Check that *value* is a date between today and one year from now.

        Both bounds are inclusive and compared as calendar dates, so a
        delivery scheduled for today is accepted.
        """
        parsed = None
        if isinstance(value, str):
            for fmt in self.DATE_FORMATS:
                try:
                    parsed = datetime.strptime(value.strip(), fmt).date()
                    break
                except ValueError:
                    continue

        if parsed is None:
            return self._fail(
                "delivery_date",
                "Invalid date format (YYYY-MM-DD expected)",
                value
            )

        # Compare dates, not datetimes: a parsed date is midnight, which is
        # always earlier than "now" on the same day.
        today = datetime.now().date()
        if parsed < today:
            return self._fail(
                "delivery_date",
                "Delivery date cannot be in the past",
                value
            )

        if parsed > today + timedelta(days=365):
            return self._fail(
                "delivery_date",
                "Delivery date cannot exceed one year",
                value
            )

        return True

    def validate_amount(self, value: float) -> bool:
        """Check that *value* is a number in the range 0..10000 (inclusive)."""
        try:
            # Written as a positive range test so NaN (for which every
            # comparison is False) is rejected rather than accepted.
            in_range = 0 <= value <= 10000
        except TypeError:
            in_range = False

        if not in_range:
            return self._fail(
                "order_amount",
                "Amount must be between 0 and 10000 EUR",
                value
            )
        return True

    def validate_products(self, products: List[Dict]) -> bool:
        """Check the product list: 1 to 10 items, each named, quantity 1..100.

        Validation stops at the first invalid product.
        """
        products = products or []  # treat None like an empty list
        if not 1 <= len(products) <= 10:
            return self._fail(
                "products",
                "Number of products must be between 1 and 10",
                len(products)
            )

        for i, product in enumerate(products):
            if not product.get('product_name'):
                return self._fail(
                    f"products[{i}].product_name",
                    "Product name is required",
                    product
                )

            quantity = product.get('quantity')
            # A missing or null quantity is invalid, not a TypeError.
            if not isinstance(quantity, (int, float)) or not 1 <= quantity <= 100:
                return self._fail(
                    f"products[{i}].quantity",
                    "Quantity must be between 1 and 100",
                    quantity
                )

        return True

def process_with_validation(file_path: str, yaml_path: str, api_key: str,
                            timeout: float = 60.0) -> Dict[str, Any]:
    """
    Process a document with advanced validation

    The document and its YAML schema are uploaded to the ParseMyFile
    ``/api/v1/generate`` endpoint, then the extracted fields are checked with
    ``AdvancedValidator``.

    Args:
        file_path: Path of the document to upload.
        yaml_path: Path of the YAML schema describing the fields to extract.
        api_key: Value sent in the ``X-API-KEY`` header.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A dict with keys ``data`` (extracted fields), ``valid`` (True when no
        validation error was recorded), ``errors`` (list of
        ``ValidationError``) and ``metadata`` (taken from the API response).

    Raises:
        Exception: If the API answers with a status code other than 200.
    """
    import mimetypes
    import os

    # API call
    url = "https://api.parsemyfile.com/api/v1/generate"
    headers = {"X-API-KEY": api_key}

    # Guess the MIME type from the extension; fall back to PDF when unknown.
    content_type = mimetypes.guess_type(file_path)[0] or 'application/pdf'

    with open(file_path, 'rb') as file, open(yaml_path, 'rb') as yaml_file:
        files = {
            'file': (os.path.basename(file_path), file, content_type),
            'yaml_file': ('config.yaml', yaml_file, 'text/yaml')
        }

        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(url, headers=headers, files=files, timeout=timeout)

    # The files are only needed for the upload, so validation runs after
    # they have been closed.
    if response.status_code != 200:
        raise Exception(f"API Error: {response.text}")

    result = response.json()
    data = result['data']['extracted_fields']

    # Validation
    validator = AdvancedValidator()

    # Individual field validation. ``or ''`` also covers fields present with
    # an explicit null, which ``dict.get(key, default)`` passes through as None.
    validator.validate_order_number(data.get('order_number') or '')
    validator.validate_email(data.get('client_email') or '')
    validator.validate_delivery_date(data.get('delivery_date') or '')

    raw_amount = data.get('order_amount')
    try:
        amount = float(raw_amount if raw_amount is not None else 0)
    except (TypeError, ValueError):
        # A non-numeric amount is a validation failure, not a crash.
        validator.errors.append(ValidationError(
            "order_amount",
            "Amount must be a number",
            raw_amount
        ))
    else:
        validator.validate_amount(amount)

    validator.validate_products(data.get('products') or [])

    return {
        'data': data,
        'valid': len(validator.errors) == 0,
        'errors': validator.errors,
        'metadata': result['data']['metadata']
    }

# Usage
try:
    result = process_with_validation('order.pdf', 'advanced-validation.yaml', 'your_api_key')

    if not result['valid']:
        # Report every collected validation failure
        print("❌ Validation errors detected:")
        for issue in result['errors']:
            print(f"  - {issue.field}: {issue.message} (value: {issue.value})")
    else:
        # Summarize the validated order
        print("✅ Document processed and validated successfully")
        print(f"Order number: {result['data']['order_number']}")
        print(f"Client: {result['data']['client_email']}")
        print(f"Amount: {result['data']['order_amount']} EUR")

except Exception as e:
    print(f"Error: {e}")

Example 4: Database Integration

Python Code with Database Integration

python
import json
import sqlite3
from datetime import datetime
from typing import Any, Dict, List

import requests

class DocumentProcessor:
    """Process documents through the ParseMyFile API and persist the results.

    Each processed document is stored in a SQLite database: one row in
    ``documents`` and, for successful extractions, one row per extracted
    field in ``extracted_fields``.
    """

    def __init__(self, api_key: str, db_path: str = "documents.db"):
        """Store the credentials and create the tables if they are missing.

        Args:
            api_key: Value sent in the ``X-API-KEY`` header.
            db_path: Path of the SQLite database file.
        """
        self.api_key = api_key
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Initialize the database"""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT NOT NULL,
                    document_type TEXT NOT NULL,
                    processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    status TEXT NOT NULL,
                    confidence_score REAL,
                    extracted_data TEXT,
                    error_message TEXT
                )
            ''')

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS extracted_fields (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    document_id INTEGER,
                    field_name TEXT NOT NULL,
                    field_value TEXT,
                    field_type TEXT,
                    confidence REAL,
                    FOREIGN KEY (document_id) REFERENCES documents (id)
                )
            ''')

            conn.commit()
        finally:
            # Close the connection even when table creation fails.
            conn.close()

    def process_document(self, file_path: str, yaml_path: str, document_type: str) -> Dict[str, Any]:
        """Process a document and save to database

        Args:
            file_path: Path of the document to upload.
            yaml_path: Path of the YAML schema describing the fields to extract.
            document_type: Free-form label stored with the document.

        Returns:
            ``{'success': True, 'document_id': ..., 'data': ...}`` when the
            extraction succeeded, otherwise ``{'success': False, 'error': ...}``.
            Failures (including exceptions raised while calling the API) are
            recorded as a ``documents`` row with status ``'error'``.
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        try:
            # API call
            result = self.call_api(file_path, yaml_path)

            if result.get('status') == 'success':
                extracted = result['data']['extracted_fields']
                # A response without a confidence score is stored with NULL
                # instead of being recorded as a failed document.
                confidence = (result['data'].get('metadata') or {}).get('confidence_score')

                # Save document
                cursor.execute('''
                    INSERT INTO documents (filename, document_type, status, confidence_score, extracted_data)
                    VALUES (?, ?, ?, ?, ?)
                ''', (
                    file_path,
                    document_type,
                    'success',
                    confidence,
                    json.dumps(extracted)
                ))

                document_id = cursor.lastrowid

                # Save extracted fields
                cursor.executemany('''
                    INSERT INTO extracted_fields (document_id, field_name, field_value, field_type, confidence)
                    VALUES (?, ?, ?, ?, ?)
                ''', [
                    (
                        document_id,
                        field_name,
                        str(field_value),
                        'text',  # Default type
                        confidence
                    )
                    for field_name, field_value in extracted.items()
                ])

                conn.commit()

                return {
                    'success': True,
                    'document_id': document_id,
                    'data': result['data']
                }

            # Save error
            cursor.execute('''
                INSERT INTO documents (filename, document_type, status, error_message)
                VALUES (?, ?, ?, ?)
            ''', (file_path, document_type, 'error', str(result.get('error', 'Unknown error'))))

            conn.commit()

            return {
                'success': False,
                'error': result.get('error', 'Unknown error')
            }

        except Exception as e:
            # Discard rows written before the failure; otherwise the commit
            # below would also persist a half-saved "success" document.
            conn.rollback()

            # Save exception
            cursor.execute('''
                INSERT INTO documents (filename, document_type, status, error_message)
                VALUES (?, ?, ?, ?)
            ''', (file_path, document_type, 'error', str(e)))

            conn.commit()

            return {
                'success': False,
                'error': str(e)
            }
        finally:
            conn.close()

    def call_api(self, file_path: str, yaml_path: str, timeout: float = 60.0) -> Dict[str, Any]:
        """Call to ParseMyFile API

        Args:
            file_path: Path of the document to upload.
            yaml_path: Path of the YAML schema describing the fields to extract.
            timeout: Seconds to wait for the API before giving up.

        Returns:
            The decoded JSON response on HTTP 200, otherwise a dict with
            ``status`` set to ``'error'`` and an ``error`` message.
        """
        import mimetypes
        import os

        url = "https://api.parsemyfile.com/api/v1/generate"
        headers = {"X-API-KEY": self.api_key}

        # Guess the MIME type from the extension; fall back to PDF when unknown.
        content_type = mimetypes.guess_type(file_path)[0] or 'application/pdf'

        with open(file_path, 'rb') as file, open(yaml_path, 'rb') as yaml_file:
            files = {
                'file': (os.path.basename(file_path), file, content_type),
                'yaml_file': ('config.yaml', yaml_file, 'text/yaml')
            }

            # Without a timeout, requests waits indefinitely on a stalled server.
            response = requests.post(url, headers=headers, files=files, timeout=timeout)

        if response.status_code == 200:
            return response.json()

        return {
            'status': 'error',
            'error': f"API Error: {response.status_code} - {response.text}"
        }

    def get_document_stats(self) -> Dict[str, Any]:
        """Get processed documents statistics

        Returns:
            Total, successful and failed document counts (0 when the table is
            empty), the average confidence score (``None`` when no score is
            stored) and a ``by_type`` mapping of document type to count.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            # General statistics
            cursor.execute('''
                SELECT
                    COUNT(*) as total,
                    SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as success,
                    SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as errors,
                    AVG(confidence_score) as avg_confidence
                FROM documents
            ''')

            total, successful, errors, avg_confidence = cursor.fetchone()

            # Statistics by document type
            cursor.execute('''
                SELECT document_type, COUNT(*) as count
                FROM documents
                GROUP BY document_type
            ''')

            by_type = dict(cursor.fetchall())
        finally:
            conn.close()

        return {
            'total_documents': total,
            # SUM over an empty table yields NULL; report counts as 0.
            'successful': successful or 0,
            'errors': errors or 0,
            'average_confidence': avg_confidence,
            'by_type': by_type
        }

    def search_documents(self, field_name: str, field_value: str) -> List[Dict[str, Any]]:
        """Search documents by field value

        Args:
            field_name: Exact name of the extracted field to search.
            field_value: Substring looked up with SQL ``LIKE``.

        Returns:
            One dict per matching field row, holding the document columns
            plus the matching ``field_value``.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            # Columns are listed explicitly so the result keys cannot drift
            # from the table layout (``d.*`` put error_message, not the
            # field value, at index 7).
            cursor.execute('''
                SELECT d.id, d.filename, d.document_type, d.processed_at,
                       d.status, d.confidence_score, ef.field_value
                FROM documents d
                JOIN extracted_fields ef ON d.id = ef.document_id
                WHERE ef.field_name = ? AND ef.field_value LIKE ?
            ''', (field_name, f"%{field_value}%"))

            keys = ('id', 'filename', 'document_type', 'processed_at',
                    'status', 'confidence_score', 'field_value')
            return [dict(zip(keys, row)) for row in cursor.fetchall()]
        finally:
            conn.close()

# Usage
processor = DocumentProcessor('your_api_key')

# Process a document and report the outcome
result = processor.process_document('invoice.pdf', 'invoice.yaml', 'invoice')

print(
    f"✅ Document processed successfully (ID: {result['document_id']})"
    if result['success']
    else f"❌ Error: {result['error']}"
)

# Statistics over everything stored so far
stats = processor.get_document_stats()
print(f"Statistics: {stats['successful']}/{stats['total_documents']} documents processed successfully")

# Search stored documents by an extracted field
results = processor.search_documents('client_email', 'john@example.com')
print(f"Documents found: {len(results)}")

Example 5: Custom REST API

Flask Code for REST API

python
from flask import Flask, request, jsonify
import requests
import os
from werkzeug.utils import secure_filename
import json

app = Flask(__name__)
# Cap the accepted request body size at 10 MB
app.config.update(MAX_CONTENT_LENGTH=10 * 1024 * 1024)

# Configuration: the API key is read from the environment (None when unset)
PARSE_MY_FILE_API_KEY = os.environ.get('PARSE_MY_FILE_API_KEY')
PARSE_MY_FILE_BASE_URL = 'https://api.parsemyfile.com'

@app.route('/api/process', methods=['POST'])
def process_document():
    """Endpoint to process a document

    Expects a multipart form with ``file`` (the document) and ``yaml_file``
    (the extraction schema), forwards both to the ParseMyFile API and relays
    the result as JSON.

    Responses:
        200: ``success`` true with ``data`` and ``processing_info``.
        400: one of the two files is missing or has an empty filename.
        500: the API key is not configured, or an unexpected error occurred.
        502: the ParseMyFile API could not be reached or timed out.
        other: the status code returned by the ParseMyFile API on failure.
    """
    try:
        # Check files
        if 'file' not in request.files or 'yaml_file' not in request.files:
            return jsonify({'success': False, 'error': 'Missing files'}), 400

        file = request.files['file']
        yaml_file = request.files['yaml_file']

        if file.filename == '' or yaml_file.filename == '':
            return jsonify({'success': False, 'error': 'No file selected'}), 400

        # Fail with a clear message instead of an opaque header error when
        # the environment variable was never set.
        if not PARSE_MY_FILE_API_KEY:
            return jsonify({
                'success': False,
                'error': 'PARSE_MY_FILE_API_KEY is not configured'
            }), 500

        # Call to ParseMyFile API
        files = {
            'file': (secure_filename(file.filename), file, file.content_type),
            'yaml_file': (secure_filename(yaml_file.filename), yaml_file, 'text/yaml')
        }

        headers = {'X-API-KEY': PARSE_MY_FILE_API_KEY}

        # Without a timeout a stalled upstream would block this worker forever.
        response = requests.post(
            f'{PARSE_MY_FILE_BASE_URL}/api/v1/generate',
            headers=headers,
            files=files,
            timeout=60
        )

        if response.status_code == 200:
            result = response.json()
            return jsonify({
                'success': True,
                'data': result['data'],
                'processing_info': result.get('processing_info', {})
            })
        else:
            return jsonify({
                'success': False,
                'error': f"API Error: {response.status_code}",
                'details': response.text
            }), response.status_code

    except requests.RequestException as e:
        # Network failure or timeout while talking to the upstream API.
        return jsonify({
            'success': False,
            'error': f"ParseMyFile API unreachable: {e}"
        }), 502
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route('/api/health', methods=['GET'])
def health_check():
    """API health check

    Probes ``/health`` on the ParseMyFile API. Returns 200 with the upstream
    payload when the probe answers 200, otherwise 503.
    """
    try:
        # A health probe must answer quickly: bound the wait so a stalled
        # upstream is reported as unhealthy instead of hanging this endpoint.
        response = requests.get(f'{PARSE_MY_FILE_BASE_URL}/health', timeout=5)

        if response.status_code == 200:
            return jsonify({
                'status': 'healthy',
                'parse_my_file_api': response.json()
            })
        else:
            return jsonify({
                'status': 'unhealthy',
                'parse_my_file_api': 'unavailable'
            }), 503

    except Exception as e:
        # Covers network errors, timeouts and a non-JSON health payload.
        return jsonify({
            'status': 'unhealthy',
            'error': str(e)
        }), 503

@app.route('/api/templates', methods=['GET'])
def get_templates():
    """Get available configuration templates

    Returns a JSON object keyed by template id; each entry has a ``name``, a
    ``description`` and the ``yaml`` text of the configuration. The YAML uses
    the same ``schemas`` / ``data`` / ``properties`` layout as the other
    configuration files shown in these examples.
    """
    templates = {
        'invoice': {
            'name': 'Invoice',
            'description': 'Configuration for processing invoices',
            'yaml': '''
schemas:
  data:
    type: object
    properties:
      invoice_number:
        type: string
        description: invoice number
      total_amount:
        type: double
        description: invoice total amount in euros
'''
        },
        'contract': {
            'name': 'Contract',
            'description': 'Configuration for processing contracts',
            'yaml': '''
schemas:
  data:
    type: object
    properties:
      contract_title:
        type: string
        description: contract title
      contracting_parties:
        type: string
        description: contracting parties (may span several lines)
'''
        }
    }

    return jsonify(templates)

if __name__ == '__main__':
    # Debug mode enables the interactive Werkzeug debugger, which lets anyone
    # who can reach the server execute code. It is therefore opt-in through
    # FLASK_DEBUG and, when enabled, the server only listens on localhost.
    debug = os.getenv('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug, host='127.0.0.1' if debug else '0.0.0.0', port=5000)

HTML Code for User Interface

html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ParseMyFile - Document Processing Interface</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; }
        .container { max-width: 800px; margin: 0 auto; }
        .form-group { margin-bottom: 20px; }
        label { display: block; margin-bottom: 5px; font-weight: bold; }
        input[type="file"] { width: 100%; padding: 10px; }
        button { background: #007bff; color: white; padding: 10px 20px; border: none; cursor: pointer; }
        button:hover { background: #0056b3; }
        .result { margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 5px; }
        .error { background: #f8d7da; color: #721c24; }
        .success { background: #d4edda; color: #155724; }
    </style>
</head>
<body>
    <div class="container">
        <h1>ParseMyFile - Document Processing</h1>
        
        <form id="processForm" enctype="multipart/form-data">
            <div class="form-group">
                <label for="file">Document to process:</label>
                <input type="file" id="file" name="file" accept=".pdf,.jpg,.jpeg,.png,.tiff,.bmp" required>
            </div>
            
            <div class="form-group">
                <label for="yaml_file">YAML Configuration:</label>
                <input type="file" id="yaml_file" name="yaml_file" accept=".yaml,.yml" required>
            </div>
            
            <button type="submit">Process Document</button>
        </form>
        
        <div id="result" class="result" style="display: none;"></div>
    </div>

    <script>
        // Submit handler: upload both files to /api/process and show the outcome.
        document.getElementById('processForm').addEventListener('submit', async (e) => {
            e.preventDefault();

            // showResult assigns its message to innerHTML, so every value that
            // comes from the server (extracted document text, error messages)
            // is escaped before being placed into markup.
            const HTML_ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
            const escapeHtml = (value) =>
                String(value).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);

            const formData = new FormData();
            const fileInput = document.getElementById('file');
            const yamlInput = document.getElementById('yaml_file');

            if (fileInput.files.length === 0 || yamlInput.files.length === 0) {
                showResult('Please select both files', 'error');
                return;
            }

            formData.append('file', fileInput.files[0]);
            formData.append('yaml_file', yamlInput.files[0]);

            try {
                showResult('Processing...', 'info');

                const response = await fetch('/api/process', {
                    method: 'POST',
                    body: formData
                });

                let result;
                try {
                    result = await response.json();
                } catch (parseError) {
                    // Error pages (e.g. an oversized upload) are not JSON;
                    // report the HTTP status instead of a JSON parse error.
                    throw new Error(`Server returned ${response.status} ${response.statusText}`);
                }

                if (result.success) {
                    showResult(`
                        <h3>✅ Document processed successfully</h3>
                        <h4>Extracted data:</h4>
                        <pre>${escapeHtml(JSON.stringify(result.data.extracted_fields, null, 2))}</pre>
                        <h4>Metadata:</h4>
                        <pre>${escapeHtml(JSON.stringify(result.data.metadata, null, 2))}</pre>
                    `, 'success');
                } else {
                    showResult(`❌ Error: ${escapeHtml(result.error)}`, 'error');
                }
            } catch (error) {
                showResult(`❌ Error: ${escapeHtml(error.message)}`, 'error');
            }
        });
        
        /**
         * Show a message in the #result panel.
         *
         * @param {string} message - HTML assigned to the panel's innerHTML
         * @param {string} type - suffix appended to the "result" CSS class
         */
        function showResult(message, type) {
            const panel = document.getElementById('result');
            const cssClasses = ['result', type].join(' ');

            panel.innerHTML = message;
            panel.className = cssClasses;
            panel.style.display = 'block';
        }
    </script>
</body>
</html>

These advanced examples show how to use the ParseMyFile API in complex and professional contexts, with validation, database integration, and custom REST API creation.

ParseMyFile API Documentation