Examples

Extract Text from Webpage

Extract clean, readable text from any webpage or URL. The API automatically removes navigation elements, ads, and formatting to give you just the content.

Overview

The /extract/url endpoint accepts a URL and returns the extracted text content. The API handles JavaScript-rendered pages, removes noise, and extracts the main content.

cURL Example

Code
 
# POST the page address to the extraction endpoint as a JSON body.
curl --request POST https://api.crawler.dev/v1/extract/url \
  --header "x-api-key: YOUR_API_KEY" \
  --header "Content-Type: application/json" \
  --data '{
    "url": "https://example.com/article",
    "cleanText": true
  }'

Response:


Code
 
{
  "url": "https://example.com/article",
  "finalUrl": "https://example.com/article",
  "statusCode": 200,
  "contentType": "text/html",
  "size": 2048,
  "text": "This is the clean extracted text from the webpage..."
}

JavaScript Example

Basic Usage


Code
 
/**
 * Extract the text content of a webpage through the /extract/url endpoint.
 *
 * @param {string} url - Address of the page to extract.
 * @param {boolean} [cleanText=true] - Sent as the `cleanText` request field.
 * @returns {Promise<string>} The `text` field of the API response.
 * @throws {Error} When the API answers with a non-2xx status. The message is
 *   the API's `error.message` when one is available, otherwise a generic one.
 */
async function extractTextFromUrl(url, cleanText = true) {
  const response = await fetch('https://api.crawler.dev/v1/extract/url', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({ url, cleanText })
  });

  if (!response.ok) {
    // An error body is not guaranteed to be JSON (for example an HTML page
    // from a gateway). A parse failure must not hide the HTTP failure itself.
    const errorBody = await response.json().catch(() => null);
    throw new Error(errorBody?.error?.message || 'Failed to extract text');
  }

  const result = await response.json();
  return result.text;
}

// Usage: print the extracted text, or the failure message if the call rejects.
(async () => {
  try {
    const text = await extractTextFromUrl('https://example.com/article');
    console.log('Extracted text:', text);
  } catch (error) {
    console.error('Error:', error.message);
  }
})();

With Error Handling


Code
 
/**
 * Extract text and metadata from a webpage, turning failures into
 * descriptive errors.
 *
 * The API key is read from the CRAWLER_API_KEY environment variable.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<{text: *, url: *, finalUrl: *, statusCode: *, contentType: *, size: *}>}
 *   The corresponding fields of the API response.
 * @throws {Error} With a status-specific message for 400, 401, 404 and 429,
 *   the API's own message (or 'Unknown error') for any other non-2xx status,
 *   and a network-error message when the request itself fails.
 */
async function extractTextFromUrlSafe(url) {
  let response;
  try {
    response = await fetch('https://api.crawler.dev/v1/extract/url', {
      method: 'POST',
      headers: {
        'x-api-key': process.env.CRAWLER_API_KEY,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        url: url,
        cleanText: true
      })
    });
  } catch (error) {
    // fetch() rejects with a TypeError when the request cannot be completed.
    // The message wording differs between runtimes, so only the error type
    // is checked, not the message text.
    if (error.name === 'TypeError') {
      throw new Error('Network error: Check your internet connection');
    }
    throw error;
  }

  // Error responses may not carry a JSON body. Tolerate that so the
  // status-specific messages below are still produced. On a successful
  // response a malformed body is a real failure and is re-thrown.
  let data = null;
  try {
    data = await response.json();
  } catch (parseError) {
    if (response.ok) {
      throw parseError;
    }
  }

  if (!response.ok) {
    const apiMessage = data?.error?.message;
    switch (response.status) {
      case 400:
        // Avoid the literal text "undefined" when the API sent no message.
        throw new Error(apiMessage ? `Invalid URL: ${apiMessage}` : 'Invalid URL');
      case 401:
        throw new Error('Invalid API key');
      case 404:
        throw new Error('Page not found');
      case 429:
        throw new Error('Rate limit exceeded. Please try again later.');
      default:
        throw new Error(apiMessage || 'Unknown error');
    }
  }

  return {
    text: data.text,
    url: data.url,
    finalUrl: data.finalUrl,
    statusCode: data.statusCode,
    contentType: data.contentType,
    size: data.size
  };
}

Python Example

Basic Usage


Code
 
import requests
import os

def extract_text_from_url(url, clean_text=True, timeout=30):
    """
    Extract text from a webpage using the crawler.dev API.

    Args:
        url: URL of the webpage to extract text from
        clean_text: Whether to clean the extracted text (default: True)
        timeout: Seconds to wait for the API before giving up (default: 30)

    Returns:
        dict: API response containing extracted text and metadata

    Raises:
        ValueError: If the CRAWLER_API_KEY environment variable is not set
        requests.exceptions.HTTPError: If the API responds with an error status
        requests.exceptions.Timeout: If the API does not answer within timeout
    """
    api_key = os.getenv('CRAWLER_API_KEY')

    if not api_key:
        raise ValueError("CRAWLER_API_KEY environment variable not set")

    endpoint = "https://api.crawler.dev/v1/extract/url"
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json"
    }

    payload = {
        "url": url,
        "cleanText": clean_text
    }

    # Without an explicit timeout, requests waits indefinitely for a response.
    response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()

    return response.json()

# Usage
try:
    result = extract_text_from_url('https://example.com/article')
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError is what extract_text_from_url raises when CRAWLER_API_KEY
    # is not set; report it like any other failure instead of crashing.
    print(f"Error: {e}")
else:
    print(f"URL: {result.get('url', 'N/A')}")
    print(f"Status Code: {result.get('statusCode', 'N/A')}")
    print(f"Text length: {len(result['text'])} characters")
    print(result['text'])

With Retry Logic


Code
 
import requests
import time
from typing import Optional

def extract_text_with_retry(url: str, max_retries: int = 3, delay: int = 1) -> Optional[dict]:
    """
    Extract text from URL with automatic retry on failure.

    Server errors (5xx), rate limiting (429) and network errors are retried.
    Any other client error (4xx) is raised immediately because repeating the
    same request would not change the outcome.

    Args:
        url: URL to extract text from
        max_retries: Maximum number of attempts
        delay: Delay between retries in seconds

    Returns:
        dict: API response. None is returned only when max_retries is less
        than 1, i.e. when no attempt is made at all.

    Raises:
        ValueError: If the CRAWLER_API_KEY environment variable is not set
        requests.exceptions.HTTPError: For a non-retryable 4xx response, or
            when the last attempt still ends in an HTTP error
        requests.exceptions.RequestException: When the last attempt fails
            with a network error
    """
    # Imported here because this snippet's import list does not include os.
    import os

    api_key = os.getenv('CRAWLER_API_KEY')
    if not api_key:
        raise ValueError("CRAWLER_API_KEY environment variable not set")

    endpoint = "https://api.crawler.dev/v1/extract/url"
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json"
    }
    payload = {"url": url, "cleanText": True}

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.post(endpoint, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            # Read the status from the exception itself rather than from the
            # loop variable, so the check never depends on outer state.
            status = e.response.status_code if e.response is not None else None
            # 429 means "rate limited": waiting and retrying is the expected
            # reaction, unlike other 4xx codes.
            is_client_error = status is not None and 400 <= status < 500
            if (is_client_error and status != 429) or is_last_attempt:
                raise
            print(f"Attempt {attempt + 1} failed, retrying in {delay} seconds...")
        except requests.exceptions.RequestException:
            if is_last_attempt:
                raise
            print(f"Network error, retrying in {delay} seconds...")
        # Only reached when an attempt failed and another one follows.
        time.sleep(delay)

    return None

Clean Text Option

The cleanText parameter controls whether the API removes formatting and noise:

cleanText: true (default): Removes HTML tags, navigation, ads, and formatting. Returns clean, readable text.
cleanText: false: Returns raw extracted text with minimal processing.


Code
 
const pageUrl = 'https://example.com';

// Recommended: request the cleaned text
const cleanText = await extractTextFromUrl(pageUrl, true);

// Alternative: request the raw text
const rawText = await extractTextFromUrl(pageUrl, false);

Common Use Cases

Extract Article Content


Code
 
/**
 * Extract an article and attach a word count to the result.
 *
 * @param {string} url - Address of the article to extract.
 * @returns {Promise<{url: *, finalUrl: *, content: *, wordCount: number, statusCode: *, contentType: *}>}
 *   Fields taken from extractTextFromUrlSafe, plus the number of
 *   whitespace-separated words in the text.
 */
async function extractArticle(url) {
  const result = await extractTextFromUrlSafe(url);

  // Trim first: splitting an empty or whitespace-padded string on /\s+/
  // produces empty entries that would be counted as words.
  const trimmedText = (result.text ?? '').trim();
  const wordCount = trimmedText === '' ? 0 : trimmedText.split(/\s+/).length;

  return {
    url: result.url,
    finalUrl: result.finalUrl,
    content: result.text,
    wordCount,
    statusCode: result.statusCode,
    contentType: result.contentType
  };
}

Validate URL Before Extraction


Code
 
/**
 * Report whether the given value can be parsed by the URL constructor.
 *
 * @param {*} url - Candidate value.
 * @returns {boolean} true when `new URL(url)` succeeds, false when it throws.
 */
function isValidUrl(url) {
  let parsedOk = true;
  try {
    new URL(url);
  } catch {
    parsedOk = false;
  }
  return parsedOk;
}

/**
 * Validate a URL locally and only then send it to the extraction API.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<object>} The result of extractTextFromUrlSafe.
 * @throws {Error} 'Invalid URL format' when the value is not a parseable URL
 *   string, or 'URL must start with http:// or https://' for another scheme.
 */
async function safeExtract(url) {
  // isValidUrl also accepts non-string values such as URL objects, which
  // have no string methods; reject them here instead of failing later.
  if (typeof url !== 'string' || !isValidUrl(url)) {
    throw new Error('Invalid URL format');
  }

  // URL schemes are case-insensitive, so "HTTPS://..." must pass as well.
  if (!/^https?:\/\//i.test(url)) {
    throw new Error('URL must start with http:// or https://');
  }

  return await extractTextFromUrlSafe(url);
}

Rate Limits

Be mindful of rate limits when extracting from multiple URLs:

Free tier: Check your plan limits
Use batch processing for multiple URLs
Implement exponential backoff for retries

Next Steps

Learn how to extract markdown from HTML
Explore batch processing for multiple URLs
Check out the API Reference for complete documentation

Extract Text from File Extract Markdown from HTML

Examples

Extract Text from Webpage

Extract clean, readable text from any webpage or URL. The API automatically removes navigation elements, ads, and formatting to give you just the content.

Overview

The /extract/url endpoint accepts a URL and returns the extracted text content. The API handles JavaScript-rendered pages, removes noise, and extracts the main content.

cURL Example

Code
 
# POST the page address to the extraction endpoint as a JSON body.
curl --request POST https://api.crawler.dev/v1/extract/url \
  --header "x-api-key: YOUR_API_KEY" \
  --header "Content-Type: application/json" \
  --data '{
    "url": "https://example.com/article",
    "cleanText": true
  }'

Response:


Code
 
{
  "url": "https://example.com/article",
  "finalUrl": "https://example.com/article",
  "statusCode": 200,
  "contentType": "text/html",
  "size": 2048,
  "text": "This is the clean extracted text from the webpage..."
}

JavaScript Example

Basic Usage


Code
 
/**
 * Extract the text content of a webpage through the /extract/url endpoint.
 *
 * @param {string} url - Address of the page to extract.
 * @param {boolean} [cleanText=true] - Sent as the `cleanText` request field.
 * @returns {Promise<string>} The `text` field of the API response.
 * @throws {Error} When the API answers with a non-2xx status. The message is
 *   the API's `error.message` when one is available, otherwise a generic one.
 */
async function extractTextFromUrl(url, cleanText = true) {
  const response = await fetch('https://api.crawler.dev/v1/extract/url', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({ url, cleanText })
  });

  if (!response.ok) {
    // An error body is not guaranteed to be JSON (for example an HTML page
    // from a gateway). A parse failure must not hide the HTTP failure itself.
    const errorBody = await response.json().catch(() => null);
    throw new Error(errorBody?.error?.message || 'Failed to extract text');
  }

  const result = await response.json();
  return result.text;
}

// Usage: print the extracted text, or the failure message if the call rejects.
(async () => {
  try {
    const text = await extractTextFromUrl('https://example.com/article');
    console.log('Extracted text:', text);
  } catch (error) {
    console.error('Error:', error.message);
  }
})();

With Error Handling


Code
 
/**
 * Extract text and metadata from a webpage, turning failures into
 * descriptive errors.
 *
 * The API key is read from the CRAWLER_API_KEY environment variable.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<{text: *, url: *, finalUrl: *, statusCode: *, contentType: *, size: *}>}
 *   The corresponding fields of the API response.
 * @throws {Error} With a status-specific message for 400, 401, 404 and 429,
 *   the API's own message (or 'Unknown error') for any other non-2xx status,
 *   and a network-error message when the request itself fails.
 */
async function extractTextFromUrlSafe(url) {
  let response;
  try {
    response = await fetch('https://api.crawler.dev/v1/extract/url', {
      method: 'POST',
      headers: {
        'x-api-key': process.env.CRAWLER_API_KEY,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        url: url,
        cleanText: true
      })
    });
  } catch (error) {
    // fetch() rejects with a TypeError when the request cannot be completed.
    // The message wording differs between runtimes, so only the error type
    // is checked, not the message text.
    if (error.name === 'TypeError') {
      throw new Error('Network error: Check your internet connection');
    }
    throw error;
  }

  // Error responses may not carry a JSON body. Tolerate that so the
  // status-specific messages below are still produced. On a successful
  // response a malformed body is a real failure and is re-thrown.
  let data = null;
  try {
    data = await response.json();
  } catch (parseError) {
    if (response.ok) {
      throw parseError;
    }
  }

  if (!response.ok) {
    const apiMessage = data?.error?.message;
    switch (response.status) {
      case 400:
        // Avoid the literal text "undefined" when the API sent no message.
        throw new Error(apiMessage ? `Invalid URL: ${apiMessage}` : 'Invalid URL');
      case 401:
        throw new Error('Invalid API key');
      case 404:
        throw new Error('Page not found');
      case 429:
        throw new Error('Rate limit exceeded. Please try again later.');
      default:
        throw new Error(apiMessage || 'Unknown error');
    }
  }

  return {
    text: data.text,
    url: data.url,
    finalUrl: data.finalUrl,
    statusCode: data.statusCode,
    contentType: data.contentType,
    size: data.size
  };
}

Python Example

Basic Usage


Code
 
import requests
import os

def extract_text_from_url(url, clean_text=True, timeout=30):
    """
    Extract text from a webpage using the crawler.dev API.

    Args:
        url: URL of the webpage to extract text from
        clean_text: Whether to clean the extracted text (default: True)
        timeout: Seconds to wait for the API before giving up (default: 30)

    Returns:
        dict: API response containing extracted text and metadata

    Raises:
        ValueError: If the CRAWLER_API_KEY environment variable is not set
        requests.exceptions.HTTPError: If the API responds with an error status
        requests.exceptions.Timeout: If the API does not answer within timeout
    """
    api_key = os.getenv('CRAWLER_API_KEY')

    if not api_key:
        raise ValueError("CRAWLER_API_KEY environment variable not set")

    endpoint = "https://api.crawler.dev/v1/extract/url"
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json"
    }

    payload = {
        "url": url,
        "cleanText": clean_text
    }

    # Without an explicit timeout, requests waits indefinitely for a response.
    response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()

    return response.json()

# Usage
try:
    result = extract_text_from_url('https://example.com/article')
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError is what extract_text_from_url raises when CRAWLER_API_KEY
    # is not set; report it like any other failure instead of crashing.
    print(f"Error: {e}")
else:
    print(f"URL: {result.get('url', 'N/A')}")
    print(f"Status Code: {result.get('statusCode', 'N/A')}")
    print(f"Text length: {len(result['text'])} characters")
    print(result['text'])

With Retry Logic


Code
 
import requests
import time
from typing import Optional

def extract_text_with_retry(url: str, max_retries: int = 3, delay: int = 1) -> Optional[dict]:
    """
    Extract text from URL with automatic retry on failure.

    Server errors (5xx), rate limiting (429) and network errors are retried.
    Any other client error (4xx) is raised immediately because repeating the
    same request would not change the outcome.

    Args:
        url: URL to extract text from
        max_retries: Maximum number of attempts
        delay: Delay between retries in seconds

    Returns:
        dict: API response. None is returned only when max_retries is less
        than 1, i.e. when no attempt is made at all.

    Raises:
        ValueError: If the CRAWLER_API_KEY environment variable is not set
        requests.exceptions.HTTPError: For a non-retryable 4xx response, or
            when the last attempt still ends in an HTTP error
        requests.exceptions.RequestException: When the last attempt fails
            with a network error
    """
    # Imported here because this snippet's import list does not include os.
    import os

    api_key = os.getenv('CRAWLER_API_KEY')
    if not api_key:
        raise ValueError("CRAWLER_API_KEY environment variable not set")

    endpoint = "https://api.crawler.dev/v1/extract/url"
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json"
    }
    payload = {"url": url, "cleanText": True}

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.post(endpoint, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.HTTPError as e:
            # Read the status from the exception itself rather than from the
            # loop variable, so the check never depends on outer state.
            status = e.response.status_code if e.response is not None else None
            # 429 means "rate limited": waiting and retrying is the expected
            # reaction, unlike other 4xx codes.
            is_client_error = status is not None and 400 <= status < 500
            if (is_client_error and status != 429) or is_last_attempt:
                raise
            print(f"Attempt {attempt + 1} failed, retrying in {delay} seconds...")
        except requests.exceptions.RequestException:
            if is_last_attempt:
                raise
            print(f"Network error, retrying in {delay} seconds...")
        # Only reached when an attempt failed and another one follows.
        time.sleep(delay)

    return None

Clean Text Option

The cleanText parameter controls whether the API removes formatting and noise:

cleanText: true (default): Removes HTML tags, navigation, ads, and formatting. Returns clean, readable text.
cleanText: false: Returns raw extracted text with minimal processing.


Code
 
const pageUrl = 'https://example.com';

// Recommended: request the cleaned text
const cleanText = await extractTextFromUrl(pageUrl, true);

// Alternative: request the raw text
const rawText = await extractTextFromUrl(pageUrl, false);

Common Use Cases

Extract Article Content


Code
 
/**
 * Extract an article and attach a word count to the result.
 *
 * @param {string} url - Address of the article to extract.
 * @returns {Promise<{url: *, finalUrl: *, content: *, wordCount: number, statusCode: *, contentType: *}>}
 *   Fields taken from extractTextFromUrlSafe, plus the number of
 *   whitespace-separated words in the text.
 */
async function extractArticle(url) {
  const result = await extractTextFromUrlSafe(url);

  // Trim first: splitting an empty or whitespace-padded string on /\s+/
  // produces empty entries that would be counted as words.
  const trimmedText = (result.text ?? '').trim();
  const wordCount = trimmedText === '' ? 0 : trimmedText.split(/\s+/).length;

  return {
    url: result.url,
    finalUrl: result.finalUrl,
    content: result.text,
    wordCount,
    statusCode: result.statusCode,
    contentType: result.contentType
  };
}

Validate URL Before Extraction


Code
 
/**
 * Report whether the given value can be parsed by the URL constructor.
 *
 * @param {*} url - Candidate value.
 * @returns {boolean} true when `new URL(url)` succeeds, false when it throws.
 */
function isValidUrl(url) {
  let parsedOk = true;
  try {
    new URL(url);
  } catch {
    parsedOk = false;
  }
  return parsedOk;
}

/**
 * Validate a URL locally and only then send it to the extraction API.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<object>} The result of extractTextFromUrlSafe.
 * @throws {Error} 'Invalid URL format' when the value is not a parseable URL
 *   string, or 'URL must start with http:// or https://' for another scheme.
 */
async function safeExtract(url) {
  // isValidUrl also accepts non-string values such as URL objects, which
  // have no string methods; reject them here instead of failing later.
  if (typeof url !== 'string' || !isValidUrl(url)) {
    throw new Error('Invalid URL format');
  }

  // URL schemes are case-insensitive, so "HTTPS://..." must pass as well.
  if (!/^https?:\/\//i.test(url)) {
    throw new Error('URL must start with http:// or https://');
  }

  return await extractTextFromUrlSafe(url);
}

Rate Limits

Be mindful of rate limits when extracting from multiple URLs:

Free tier: Check your plan limits
Use batch processing for multiple URLs
Implement exponential backoff for retries

Next Steps

Learn how to extract markdown from HTML
Explore batch processing for multiple URLs
Check out the API Reference for complete documentation

Extract Text from File Extract Markdown from HTML