Examples

Multiple File Formats

The crawler.dev API supports a wide variety of file formats. Learn how to handle different document types and extract text from each format.

Supported Formats

The API automatically detects and processes many file formats:

Documents: PDF, DOC, DOCX, RTF, TXT, ODT
Spreadsheets: XLS, XLSX, CSV, ODS
Presentations: PPT, PPTX, ODP
Images: PNG, JPG, JPEG (with OCR)
Web: HTML, HTM
Archives: ZIP (extracts and processes contents)
And many more

Universal File Handler

JavaScript


Code
 
/**
 * Upload a file to the crawler.dev extraction endpoint and return the parsed
 * JSON response.
 *
 * @param {File} file - Value appended to the multipart form under the field
 *   name `file`.
 * @returns {Promise<object>} The parsed JSON body of a successful response.
 * @throws {Error} When the response status is not OK. The message is the
 *   API's `error.message` if the error body is JSON that carries one,
 *   otherwise 'Failed to extract text'.
 */
async function extractFromAnyFile(file) {
  const formData = new FormData();
  formData.append('file', file);

  const response = await fetch('https://api.crawler.dev/v1/extract/file', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
    },
    body: formData,
  });

  if (!response.ok) {
    // An error body is not guaranteed to be JSON (e.g. an HTML page from a
    // gateway). A parse failure must not replace the real failure, so fall
    // back to the generic message instead of letting the SyntaxError escape.
    const payload = await response.json().catch(() => null);
    throw new Error(payload?.error?.message || 'Failed to extract text');
  }

  // The API response already includes filename, contentType, size, and text
  return response.json();
}

// Usage: run the extraction every time the user picks a file.
const fileInput = document.querySelector('input[type="file"]');

/**
 * 'change' handler for the file input: extracts the first selected file and
 * logs the outcome. Failures are logged rather than rethrown.
 */
async function handleFileChange(event) {
  const file = event.target.files[0];
  if (!file) {
    return; // nothing selected
  }

  try {
    const extraction = await extractFromAnyFile(file);
    console.log(`Extracted from ${extraction.contentType}:`, extraction.text);
    console.log(`File: ${extraction.filename}, Size: ${extraction.size} bytes`);
  } catch (err) {
    console.error('Error:', err.message);
  }
}

fileInput.addEventListener('change', handleFileChange);

Python


Code
 
import os
import requests
from pathlib import Path

def extract_from_any_file(file_path, api_key, timeout=None):
    """
    Extract text from any supported file format.

    Uploads the file as multipart form data (field name ``file``) to the
    crawler.dev extraction endpoint and returns the decoded JSON body.

    Args:
        file_path: Path to the file
        api_key: Your crawler.dev API key
        timeout: Optional timeout in seconds, or a ``(connect, read)`` tuple,
            forwarded to ``requests.post``. The default ``None`` waits
            indefinitely (the behaviour before this parameter existed); pass
            a value in production so a stalled connection cannot hang the
            caller forever.

    Returns:
        dict: Extraction result with format information

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If ``timeout`` is set and is exceeded.
    """
    url = "https://api.crawler.dev/v1/extract/file"
    headers = {"x-api-key": api_key}

    # Keep the file handle open only for the duration of the upload.
    with open(file_path, 'rb') as file:
        response = requests.post(
            url,
            headers=headers,
            files={'file': file},
            timeout=timeout,
        )

    response.raise_for_status()

    # The API response already includes filename, contentType, size, and text
    return response.json()

# Usage: the API key is read from the CRAWLER_API_KEY environment variable.
api_key = os.getenv('CRAWLER_API_KEY')
result = extract_from_any_file('document.pdf', api_key)

for label, field in (("Content Type", "contentType"), ("Filename", "filename")):
    print(f"{label}: {result[field]}")
print(f"Size: {result['size']} bytes")

# Show only the first 200 characters of the extracted text.
preview = result['text'][:200]
print(f"Text: {preview}...")

Format-Specific Examples

PDF Documents


Code
 
/**
 * Extract a file via extractFromAnyFile and, when the reported content type
 * is exactly 'application/pdf', log its size and extracted character count.
 *
 * @param {File} file - File to extract.
 * @returns {Promise<object>} The unmodified result of extractFromAnyFile,
 *   whether or not the file turned out to be a PDF.
 */
async function extractFromPDF(file) {
  // No PDF-specific request is needed; the generic handler covers PDFs
  const extraction = await extractFromAnyFile(file);
  const isPdf = extraction.contentType === 'application/pdf';

  if (!isPdf) {
    return extraction;
  }

  console.log(`PDF processed: ${extraction.size} bytes`);
  console.log(`Extracted ${extraction.text.length} characters`);
  return extraction;
}

Word Documents


Code
 
/**
 * Extract the text of a Word document (.doc or .docx).
 *
 * @param {File} file - File to extract.
 * @returns {Promise<string>} The `text` field of the extraction result.
 * @throws {Error} 'Not a Word document' when the reported content type is
 *   neither a "wordprocessingml" type nor 'application/msword'.
 */
async function extractFromWord(file) {
  const extraction = await extractFromAnyFile(file);
  const mimeType = extraction.contentType;

  // .docx reports a "wordprocessingml" type, legacy .doc reports application/msword
  const isDocx = mimeType.includes('wordprocessingml');
  const isLegacyDoc = mimeType === 'application/msword';

  if (!isDocx && !isLegacyDoc) {
    throw new Error('Not a Word document');
  }

  // Formatting is stripped by the API; the text is returned as-is
  return extraction.text;
}

Excel Spreadsheets


Code
 
/**
 * Extract an Excel workbook and split its text into rows of cells.
 *
 * The extracted text is split on '\n' into lines and each line on '\t' into
 * cell values.
 *
 * @param {File} file - File to extract.
 * @returns {Promise<string[][]>} One array of cell strings per text line.
 * @throws {Error} 'Not an Excel file' when the reported content type is
 *   neither a "spreadsheetml" type nor 'application/vnd.ms-excel'.
 */
async function extractFromExcel(file) {
  const extraction = await extractFromAnyFile(file);
  const mimeType = extraction.contentType;

  const isExcel =
    mimeType.includes('spreadsheetml') || mimeType === 'application/vnd.ms-excel';
  if (!isExcel) {
    throw new Error('Not an Excel file');
  }

  // The text arrives cell by cell: newline between rows, tab between cells
  const rows = [];
  for (const line of extraction.text.split('\n')) {
    rows.push(line.split('\t'));
  }
  return rows;
}

PowerPoint Presentations


Code
 
/**
 * Extract the text of a PowerPoint presentation.
 *
 * @param {File} file - File to extract.
 * @returns {Promise<string>} The `text` field of the extraction result.
 * @throws {Error} 'Not a PowerPoint file' when the reported content type is
 *   neither a "presentationml" type nor 'application/vnd.ms-powerpoint'.
 */
async function extractFromPowerPoint(file) {
  const extraction = await extractFromAnyFile(file);
  const mimeType = extraction.contentType;

  const isPowerPoint =
    mimeType.includes('presentationml') ||
    mimeType === 'application/vnd.ms-powerpoint';
  if (!isPowerPoint) {
    throw new Error('Not a PowerPoint file');
  }

  // Slide text is extracted sequentially, one slide after another
  return extraction.text;
}

Batch Processing Multiple Formats


Code
 
/**
 * Extract several files concurrently and group the successful results by
 * their reported content type.
 *
 * All extractions are started at once and every one is allowed to settle, so
 * a single failing file does not abort the batch. Failed files are left out
 * of the returned grouping and reported through `console.warn`.
 *
 * @param {File[]} files - Files to extract.
 * @returns {Promise<Object<string, object[]>>} Plain object mapping each
 *   content type to the extraction results that reported it, in input order.
 */
async function processMultipleFormats(files) {
  const outcomes = await Promise.allSettled(
    files.map((file) => extractFromAnyFile(file))
  );

  // Content types are external data. Collect them in a prototype-less object
  // so a value such as 'constructor' or '__proto__' is treated as an ordinary
  // key instead of resolving to something inherited from Object.prototype.
  const groups = Object.create(null);

  outcomes.forEach((outcome, index) => {
    if (outcome.status !== 'fulfilled') {
      // Do not lose failures silently: say which input failed and why.
      const label = files[index]?.name ?? `file #${index}`;
      console.warn(`Extraction failed for ${label}:`, outcome.reason);
      return;
    }

    const { contentType } = outcome.value;
    (groups[contentType] ??= []).push(outcome.value);
  });

  // Hand callers a regular object (normal prototype), as before.
  return { ...groups };
}

// Usage: extract a batch of files and print how many landed in each content type.
const files = [/* array of File objects */];
const grouped = await processMultipleFormats(files);

const contentTypeCount = Object.keys(grouped).length;
console.log(`Processed ${contentTypeCount} different content types`);

for (const [contentType, results] of Object.entries(grouped)) {
  console.log(`${contentType}: ${results.length} files`);
}

Format Detection and Validation


Code
 
// MIME types expected for each known extension. Extensions that are not
// listed here are validated on the extension alone.
const MIME_TYPES_BY_EXTENSION = new Map([
  ['pdf', ['application/pdf']],
  ['doc', ['application/msword']],
  ['docx', ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']],
  ['xls', ['application/vnd.ms-excel']],
  ['xlsx', ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']],
  ['txt', ['text/plain']],
  ['html', ['text/html']],
  ['png', ['image/png']],
  ['jpg', ['image/jpeg']],
  ['jpeg', ['image/jpeg']],
]);

/**
 * Check a file's extension (and, when known, its MIME type) against a list
 * of allowed formats before uploading it.
 *
 * This is a client-side convenience check only; it trusts the file name and
 * the browser-reported type.
 *
 * @param {{name: string, type: string}} file - File (or File-like object).
 * @param {string[]} allowedFormats - Allowed extensions, lowercase, no dot.
 * @returns {{isValid: boolean, format: string, reason: string}} `format` is
 *   the lowercased extension, or '' when the name has no dot.
 */
function validateFileFormat(file, allowedFormats) {
  // A name without a dot has no extension; do not mistake the whole name
  // for one (e.g. a file literally called "pdf").
  const dotIndex = file.name.lastIndexOf('.');
  const ext = dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();
  const contentType = file.type;

  const validTypes = MIME_TYPES_BY_EXTENSION.get(ext) ?? [];

  // File.type is '' when the browser could not determine the MIME type.
  // That means "unknown", not "mismatch", so fall back to the extension.
  const typeMatches =
    validTypes.length === 0 || !contentType || validTypes.includes(contentType);
  const isValid = allowedFormats.includes(ext) && typeMatches;

  return {
    isValid,
    format: ext,
    reason: isValid ? 'Valid' : 'Format not supported or not in allowed list',
  };
}

// Usage: validate the first selected file against an allow-list of extensions.
const allowed = ['pdf', 'doc', 'docx', 'txt'];
const fileField = document.querySelector('input[type="file"]');
const file = fileField.files[0];

const validation = validateFileFormat(file, allowed);
if (!validation.isValid) {
  console.error(`Invalid file: ${validation.reason}`);
}

Error Handling by Content Type


Code
 
/**
 * Extract a file and emit content-type-specific diagnostics.
 *
 * - 'application/pdf' with fewer than 100 characters of text: warns that the
 *   PDF may be image-based or empty.
 * - 'image/*': logs that OCR is used.
 * - Excel types: logs that the data was extracted as text.
 *
 * @param {File} file - File to extract; `file.name` is used in the
 *   "not supported" error message.
 * @returns {Promise<object>} The unmodified result of extractFromAnyFile.
 * @throws {Error} `File format not supported: <name>` (with the original
 *   error attached as `cause`) when the failure message mentions
 *   "unsupported"; any other failure is rethrown unchanged.
 */
async function extractWithContentTypeHandling(file) {
  try {
    const result = await extractFromAnyFile(file);

    // Content-type-specific processing
    if (result.contentType === 'application/pdf') {
      if (result.text.length < 100) {
        console.warn('PDF may be image-based or empty');
      }
    } else if (result.contentType.startsWith('image/')) {
      console.log('Using OCR for image extraction');
    } else if (result.contentType.includes('spreadsheetml') ||
               result.contentType === 'application/vnd.ms-excel') {
      console.log('Excel data extracted as text');
    }

    return result;
  } catch (error) {
    // Anything can be thrown, not just Error instances, so normalise to a
    // string first. The API's exact wording is not visible here, so match
    // "unsupported" regardless of letter case.
    const message = String(error?.message ?? error);
    if (/unsupported/i.test(message)) {
      // Keep the original failure reachable for debugging.
      throw new Error(`File format not supported: ${file.name}`, { cause: error });
    }
    throw error;
  }
}

Tips for Different Formats

PDFs: May contain images - OCR is handled automatically
Word Documents: Formatting is removed, clean text is extracted
Excel Files: Cell values are extracted, formulas are not evaluated
PowerPoint: Slide text is extracted sequentially
Images: OCR is used for text extraction
HTML: Clean text is extracted, tags are removed

Next Steps

Learn about extracting from files
Explore clean text extraction
Check the Supported File Types page for the complete list
See the API Reference for complete documentation

Clean Text Extraction