Extract clean, readable text from any webpage or URL. The API automatically removes navigation elements, ads, and formatting to give you just the content.
Overview
The /extract/url endpoint accepts a URL and returns the extracted text content. The API handles JavaScript-rendered pages, removes noise, and extracts the main content.
{ "url": "https://example.com/article", "finalUrl": "https://example.com/article", "statusCode": 200, "contentType": "text/html", "size": 2048, "text": "This is the clean extracted text from the webpage..."}
/**
 * Extract text from a URL via the /extract/url endpoint, translating HTTP
 * failures into descriptive errors.
 *
 * @param {string} url - URL of the page to extract text from.
 * @returns {Promise<{text: *, url: *, finalUrl: *, statusCode: *, contentType: *, size: *}>}
 *   The fields of the API response, copied through unchanged.
 * @throws {Error} With a status-specific message for 400, 401, 404 and 429,
 *   the API-provided message (or 'Unknown error') for any other non-2xx
 *   status, or a network-error message when fetch itself fails.
 */
async function extractTextFromUrlSafe(url) {
  try {
    const response = await fetch('https://api.crawler.dev/v1/extract/url', {
      method: 'POST',
      headers: {
        'x-api-key': process.env.CRAWLER_API_KEY,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ url: url, cleanText: true })
    });

    if (!response.ok) {
      // An error body may be missing or not JSON at all. Parse it on a
      // best-effort basis so a parse failure cannot mask the HTTP status.
      const errorBody = await response.json().catch(() => null);
      const apiMessage = errorBody?.error?.message;

      switch (response.status) {
        case 400:
          throw new Error(`Invalid URL: ${apiMessage ?? 'request was rejected'}`);
        case 401:
          throw new Error('Invalid API key');
        case 404:
          throw new Error('Page not found');
        case 429:
          throw new Error('Rate limit exceeded. Please try again later.');
        default:
          throw new Error(apiMessage || 'Unknown error');
      }
    }

    // Success path: a malformed body still surfaces as a parse error.
    const data = await response.json();
    return {
      text: data.text,
      url: data.url,
      finalUrl: data.finalUrl,
      statusCode: data.statusCode,
      contentType: data.contentType,
      size: data.size
    };
  } catch (error) {
    // fetch reports connection-level failures as a TypeError whose message
    // mentions "fetch"; rewrap those with a friendlier message.
    if (error.name === 'TypeError' && error.message.includes('fetch')) {
      throw new Error('Network error: Check your internet connection');
    }
    throw error;
  }
}
Python Example
Basic Usage
Code
import os

import requests


def extract_text_from_url(url: str, clean_text: bool = True, timeout: float = 30) -> dict:
    """Extract text from a webpage using the crawler.dev API.

    Args:
        url: URL of the webpage to extract text from.
        clean_text: Whether to clean the extracted text (default: True).
        timeout: Seconds to wait for the API before giving up (default: 30).

    Returns:
        The decoded JSON response containing the extracted text and metadata.

    Raises:
        ValueError: If the CRAWLER_API_KEY environment variable is not set.
        requests.exceptions.RequestException: If the request fails, times
            out, or the API answers with an error status.
    """
    api_key = os.getenv('CRAWLER_API_KEY')
    if not api_key:
        raise ValueError("CRAWLER_API_KEY environment variable not set")

    endpoint = "https://api.crawler.dev/v1/extract/url"
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json",
    }
    payload = {
        "url": url,
        "cleanText": clean_text,
    }

    # requests applies no timeout by default, so without one a stalled
    # connection would block this call indefinitely.
    response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()


# Usage
try:
    result = extract_text_from_url('https://example.com/article')
    print(f"URL: {result.get('url', 'N/A')}")
    print(f"Status Code: {result.get('statusCode', 'N/A')}")
    print(f"Text length: {len(result['text'])} characters")
    print(result['text'])
except (ValueError, requests.exceptions.RequestException) as e:
    # ValueError covers the missing-API-key case raised by the function itself.
    print(f"Error: {e}")
With Retry Logic
Code
import os
import time
from typing import Optional

import requests


def extract_text_with_retry(url: str, max_retries: int = 3, delay: int = 1) -> Optional[dict]:
    """Extract text from a URL, retrying on server and network failures.

    Client errors (HTTP 4xx) are never retried; they are re-raised at once.

    Args:
        url: URL to extract text from.
        max_retries: Maximum number of attempts.
        delay: Delay between attempts in seconds.

    Returns:
        The decoded JSON response. None is returned only when max_retries
        is zero or negative, i.e. when no attempt is made at all.

    Raises:
        ValueError: If the CRAWLER_API_KEY environment variable is not set.
        requests.exceptions.HTTPError: On a 4xx response, or on any other
            error status once the final attempt has failed.
        requests.exceptions.RequestException: On any other request failure
            once the final attempt has failed.
    """
    api_key = os.getenv('CRAWLER_API_KEY')
    if not api_key:
        raise ValueError("CRAWLER_API_KEY environment variable not set")

    endpoint = "https://api.crawler.dev/v1/extract/url"
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json",
    }
    payload = {"url": url, "cleanText": True}

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.post(endpoint, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError:
            # Don't retry on client errors (4xx): the same request would
            # fail the same way again.
            if 400 <= response.status_code < 500 or is_last_attempt:
                raise
            print(f"Attempt {attempt + 1} failed, retrying in {delay} seconds...")
        except requests.exceptions.RequestException:
            if is_last_attempt:
                raise
            print(f"Network error, retrying in {delay} seconds...")
        # Only reached when the attempt failed and another one remains.
        time.sleep(delay)

    return None
Clean Text Option
The cleanText parameter controls whether the API removes formatting and noise:
cleanText: true (default): Removes HTML tags, navigation, ads, and formatting. Returns clean, readable text.
cleanText: false: Returns raw extracted text with minimal processing.
Code
const pageUrl = 'https://example.com';

// Recommended: pass true to get the cleaned text.
const cleanText = await extractTextFromUrl(pageUrl, true);

// Pass false to get the raw text instead.
const rawText = await extractTextFromUrl(pageUrl, false);
/**
 * Report whether a value can be parsed as an absolute URL.
 *
 * @param {string|URL} url - Candidate URL.
 * @returns {boolean} true when the URL constructor accepts it, else false.
 */
function isValidUrl(url) {
  try {
    new URL(url);
    return true;
  } catch {
    return false;
  }
}

/**
 * Validate a URL and then extract its text with extractTextFromUrlSafe.
 *
 * @param {string|URL} url - URL to validate and extract from.
 * @returns {Promise<object>} Whatever extractTextFromUrlSafe resolves to.
 * @throws {Error} 'Invalid URL format' when the value cannot be parsed, or
 *   'URL must start with http:// or https://' for any other scheme.
 */
async function safeExtract(url) {
  if (!isValidUrl(url)) {
    throw new Error('Invalid URL format');
  }

  // Check the parsed scheme rather than the raw string prefix: the parser
  // lower-cases it, so "HTTP://..." is accepted, and URL objects (which
  // have no startsWith method) work too.
  const { protocol } = new URL(url);
  if (protocol !== 'http:' && protocol !== 'https:') {
    throw new Error('URL must start with http:// or https://');
  }

  return await extractTextFromUrlSafe(url);
}
Rate Limits
Be mindful of rate limits when extracting from multiple URLs: