/**
 * Upload a file to the crawler.dev extraction endpoint and return the parsed
 * JSON response.
 *
 * @param {File|Blob} file - The file to send as the `file` multipart field.
 * @returns {Promise<object>} The API response; it already includes
 *   `filename`, `contentType`, `size`, and `text`.
 * @throws {Error} When the server answers with a non-2xx status. The message
 *   is taken from `error.message` in the response body when available,
 *   otherwise it is 'Failed to extract text'.
 */
async function extractFromAnyFile(file) {
  const formData = new FormData();
  formData.append('file', file);

  const response = await fetch('https://api.crawler.dev/v1/extract/file', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
    },
    body: formData,
  });

  if (!response.ok) {
    let message = 'Failed to extract text';
    try {
      const payload = await response.json();
      message = payload?.error?.message || message;
    } catch {
      // The error body was not valid JSON (e.g. an HTML gateway page);
      // keep the generic message instead of surfacing a parse error.
    }
    throw new Error(message);
  }

  // The API response already includes filename, contentType, size, and text
  return response.json();
}

// Usage (browser): run an extraction whenever the user picks a file.
// The lookup is skipped outside a DOM environment, and a missing input
// element is tolerated instead of throwing on `null.addEventListener`.
const fileInput =
  typeof document === 'undefined'
    ? null
    : document.querySelector('input[type="file"]');

fileInput?.addEventListener('change', async (e) => {
  const file = e.target.files[0];
  if (!file) {
    return;
  }

  try {
    const result = await extractFromAnyFile(file);
    console.log(`Extracted from ${result.contentType}:`, result.text);
    console.log(`File: ${result.filename}, Size: ${result.size} bytes`);
  } catch (error) {
    console.error('Error:', error.message);
  }
});
Python
Code
import os
import requests
from pathlib import Path


def extract_from_any_file(file_path, api_key, timeout=60):
    """
    Extract text from any supported file format.

    Args:
        file_path: Path to the file
        api_key: Your crawler.dev API key
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        dict: Extraction result with format information

    Raises:
        OSError: If the file cannot be opened.
        requests.exceptions.HTTPError: If the API answers with an error status.
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    url = "https://api.crawler.dev/v1/extract/file"
    headers = {"x-api-key": api_key}

    # The file handle must stay open while requests streams the upload.
    with open(file_path, 'rb') as file:
        files = {'file': file}
        response = requests.post(url, headers=headers, files=files, timeout=timeout)

    response.raise_for_status()
    result = response.json()

    # The API response already includes filename, contentType, size, and text
    return result


# Usage
result = extract_from_any_file('document.pdf', os.getenv('CRAWLER_API_KEY'))
print(f"Content Type: {result['contentType']}")
print(f"Filename: {result['filename']}")
print(f"Size: {result['size']} bytes")
print(f"Text: {result['text'][:200]}...")
Format-Specific Examples
PDF Documents
Code
/**
 * Extract text from a file and, when the API reports it as a PDF, log its
 * size and the number of extracted characters.
 *
 * @param {File|Blob} file - The file to send for extraction.
 * @returns {Promise<object>} The unmodified result of `extractFromAnyFile`.
 */
async function extractFromPDF(file) {
  // PDFs are handled automatically
  const result = await extractFromAnyFile(file);

  if (result.contentType === 'application/pdf') {
    console.log(`PDF processed: ${result.size} bytes`);
    console.log(`Extracted ${result.text.length} characters`);
  }

  return result;
}
Word Documents
Code
/**
 * Extract the text of a Word document (.doc or .docx).
 *
 * @param {File|Blob} file - The file to send for extraction.
 * @returns {Promise<string>} The `text` field of the extraction result.
 * @throws {Error} 'Not a Word document' when the reported content type is
 *   neither a `wordprocessingml` type nor 'application/msword'.
 */
async function extractFromWord(file) {
  // Supports both .doc and .docx
  const result = await extractFromAnyFile(file);

  const isWord =
    result.contentType.includes('wordprocessingml') ||
    result.contentType === 'application/msword';
  if (!isWord) {
    throw new Error('Not a Word document');
  }

  // Word documents may contain formatting
  // The API extracts clean text automatically
  return result.text;
}
Excel Spreadsheets
Code
/**
 * Extract an Excel spreadsheet and split its text into rows of cells.
 *
 * @param {File|Blob} file - The file to send for extraction.
 * @returns {Promise<string[][]>} One array per line of extracted text, each
 *   holding that line's tab-separated values.
 * @throws {Error} 'Not an Excel file' when the reported content type is
 *   neither a `spreadsheetml` type nor 'application/vnd.ms-excel'.
 */
async function extractFromExcel(file) {
  const result = await extractFromAnyFile(file);

  const isExcel =
    result.contentType.includes('spreadsheetml') ||
    result.contentType === 'application/vnd.ms-excel';
  if (!isExcel) {
    throw new Error('Not an Excel file');
  }

  // Excel text is extracted cell by cell
  // You may want to parse it further
  // Accept both LF and CRLF line endings so that the last cell of a row
  // never carries a stray '\r'.
  const lines = result.text.split(/\r?\n/);
  return lines.map((line) => line.split('\t')); // Tab-separated values
}
PowerPoint Presentations
Code
/**
 * Extract the text of a PowerPoint presentation.
 *
 * @param {File|Blob} file - The file to send for extraction.
 * @returns {Promise<string>} The `text` field of the extraction result.
 * @throws {Error} 'Not a PowerPoint file' when the reported content type is
 *   neither a `presentationml` type nor 'application/vnd.ms-powerpoint'.
 */
async function extractFromPowerPoint(file) {
  const result = await extractFromAnyFile(file);

  const isPowerPoint =
    result.contentType.includes('presentationml') ||
    result.contentType === 'application/vnd.ms-powerpoint';
  if (!isPowerPoint) {
    throw new Error('Not a PowerPoint file');
  }

  // Slides are extracted sequentially
  // Each slide's text is separated
  return result.text;
}
/**
 * Extract text from a file and emit a diagnostic message that depends on the
 * content type reported by the API.
 *
 * @param {File} file - The file to send for extraction; `file.name` is used
 *   in the error message for unsupported formats.
 * @returns {Promise<object>} The unmodified result of `extractFromAnyFile`.
 * @throws {Error} `File format not supported: <name>` when the underlying
 *   error message contains 'unsupported'; the original error is attached as
 *   `cause`. Any other failure is rethrown unchanged.
 */
async function extractWithContentTypeHandling(file) {
  try {
    const result = await extractFromAnyFile(file);
    const { contentType } = result;

    // Content-type-specific processing
    if (contentType === 'application/pdf') {
      if (result.text.length < 100) {
        console.warn('PDF may be image-based or empty');
      }
    } else if (contentType.startsWith('image/')) {
      console.log('Using OCR for image extraction');
    } else if (
      contentType.includes('spreadsheetml') ||
      contentType === 'application/vnd.ms-excel'
    ) {
      console.log('Excel data extracted as text');
    }

    return result;
  } catch (error) {
    // A rejection is not guaranteed to be an Error; only inspect a string
    // message so that the original failure is never masked by a TypeError.
    const message = typeof error?.message === 'string' ? error.message : '';
    if (message.includes('unsupported')) {
      throw new Error(`File format not supported: ${file.name}`, { cause: error });
    }
    throw error;
  }
}
Tips for Different Formats
PDFs: May contain images - OCR is handled automatically
Word Documents: Formatting is removed, clean text is extracted
Excel Files: Cell values are extracted, formulas are not evaluated