Examples

Clean Text Extraction

Extract clean, readable text content without HTML tags, navigation elements, ads, or formatting noise. Perfect for content analysis, search indexing, and text processing.

Overview

The cleanText option removes unnecessary elements from extracted content, giving you just the meaningful text. This is especially useful when extracting from web pages where you want to ignore navigation, ads, and other non-content elements.

Basic Usage

JavaScript


Code
 
/**
 * Extract clean, readable text from a web page via the extraction API.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<string>} The `text` field of the API's JSON response.
 * @throws {Error} If the API answers with a non-2xx HTTP status.
 */
async function extractCleanText(url) {
  const response = await fetch('https://api.crawler.dev/v1/extract/url', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      url,
      cleanText: true, // Enable clean text extraction
    }),
  });

  // fetch() only rejects on network failure; HTTP error statuses must be
  // checked explicitly or an error payload would be parsed as a result.
  if (!response.ok) {
    throw new Error(
      `Clean text extraction failed: ${response.status} ${response.statusText}`,
    );
  }

  const result = await response.json();
  return result.text;
}

Python


Code
 
import requests

def extract_clean_text(url, api_key, timeout=30):
    """Extract clean, readable text from a web page via the extraction API.

    Args:
        url: Address of the page to extract.
        api_key: API key sent in the ``x-api-key`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``text`` field of the API's JSON response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.post(
        'https://api.crawler.dev/v1/extract/url',
        headers={
            'x-api-key': api_key,
            'Content-Type': 'application/json'
        },
        json={
            'url': url,
            'cleanText': True  # Enable clean text extraction
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()['text']

Comparison: Clean vs Raw Text

Clean Text (cleanText: true)


Code
 
// Clean text removes:
// - HTML tags
// - Navigation menus
// - Advertisements
// - Footer content
// - Sidebar elements
// - Formatting noise

// `await` here requires an async context: an ES module's top level or the
// body of an async function.
const cleanText = await extractCleanText('https://example.com/article');
// Result: Just the article content, clean and readable

Raw Text (cleanText: false)


Code
 
/**
 * Extract text from a web page with clean-text processing disabled.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<string>} The `text` field of the API's JSON response.
 * @throws {Error} If the API answers with a non-2xx HTTP status.
 */
async function extractRawText(url) {
  const response = await fetch('https://api.crawler.dev/v1/extract/url', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      url,
      cleanText: false, // Get raw extracted text
    }),
  });

  // fetch() only rejects on network failure; HTTP error statuses must be
  // checked explicitly or an error payload would be parsed as a result.
  if (!response.ok) {
    throw new Error(
      `Raw text extraction failed: ${response.status} ${response.statusText}`,
    );
  }

  const result = await response.json();
  return result.text;
}

// Raw text may include:
// - More formatting artifacts
// - Navigation elements
// - Additional metadata

Use Cases

Content Analysis


Code
 
/**
 * Compute simple readability statistics for the clean text of a page.
 *
 * @param {string} url - Address of the page to analyze.
 * @returns {Promise<{
 *   wordCount: number,
 *   sentenceCount: number,
 *   paragraphCount: number,
 *   averageWordsPerSentence: number,
 *   readingTime: number
 * }>} Counts derived from the extracted text; `readingTime` is in minutes.
 */
async function analyzeContent(url) {
  const cleanText = await extractCleanText(url);

  // split() yields empty tokens for empty input and for leading/trailing
  // whitespace; drop them so they are not counted as words.
  const words = cleanText.split(/\s+/).filter((word) => word.length > 0);
  const sentences = cleanText.split(/[.!?]+/).filter((s) => s.trim().length > 0);
  const paragraphs = cleanText.split(/\n\n+/).filter((p) => p.trim().length > 0);

  return {
    wordCount: words.length,
    sentenceCount: sentences.length,
    paragraphCount: paragraphs.length,
    // Guard against division by zero when no sentence was found.
    averageWordsPerSentence:
      sentences.length > 0 ? words.length / sentences.length : 0,
    readingTime: Math.ceil(words.length / 200), // Assuming 200 words per minute
  };
}

Search Indexing


Code
 
/**
 * Build a minimal search-index record for a page: its clean text plus the
 * ten most frequent keywords.
 *
 * @param {string} url - Address of the page to index.
 * @returns {Promise<{url: string, text: string, keywords: string[], wordCount: number}>}
 *   `wordCount` is the number of keyword candidates (words longer than three
 *   characters), not the total number of words in the text.
 */
async function indexContent(url) {
  const cleanText = await extractCleanText(url);

  // Extract keywords (simple example). Note: \w is ASCII-only, so accented
  // and non-Latin letters are stripped by this pattern.
  const words = cleanText
    .toLowerCase()
    .replace(/[^\w\s]/g, '')
    .split(/\s+/)
    .filter((word) => word.length > 3);

  // Count word frequency. A Map is used instead of a plain object because
  // words such as "constructor" or "valueOf" collide with inherited
  // Object.prototype properties and would corrupt the counts.
  const wordFreq = new Map();
  for (const word of words) {
    wordFreq.set(word, (wordFreq.get(word) ?? 0) + 1);
  }

  // Get top keywords (most frequent first; ties keep first-seen order).
  const topKeywords = [...wordFreq.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10)
    .map(([word]) => word);

  return {
    url,
    text: cleanText,
    keywords: topKeywords,
    wordCount: words.length,
  };
}

Text Summarization Preparation


Code
 
def prepare_for_summarization(url, api_key):
    """
    Fetch the clean text for ``url`` and break it into length-scored sentences.

    Returns a dict with the full text (``full_text``), a list of
    ``{'text', 'score'}`` entries where the score is the sentence's word
    count (``sentences``), the number of sentences kept
    (``total_sentences``), and the word count of the full text
    (``word_count``).
    """
    import re

    clean_text = extract_clean_text(url, api_key)

    # Break on runs of '.', '!' or '?' and keep only fragments longer than
    # 20 characters once surrounding whitespace is stripped.
    kept = []
    for fragment in re.split(r'[.!?]+', clean_text):
        candidate = fragment.strip()
        if len(candidate) > 20:
            kept.append(candidate)

    # Simple length-based score: the number of words in each sentence.
    scored = []
    for sentence in kept:
        scored.append({'text': sentence, 'score': len(sentence.split())})

    summary_input = {
        'full_text': clean_text,
        'sentences': scored,
        'total_sentences': len(kept),
        'word_count': len(clean_text.split()),
    }
    return summary_input

Post-Processing Clean Text

Sometimes you may want additional cleaning:


Code
 
/**
 * Normalize whitespace in extracted text while keeping its line and
 * paragraph structure.
 *
 * - Runs of whitespace inside a line become a single space.
 * - Each line is trimmed.
 * - Runs of blank lines are reduced to one blank line (a paragraph break).
 * - Leading and trailing whitespace of the whole text is removed.
 *
 * @param {string} text - Text to clean up.
 * @returns {string} The normalized text.
 */
function postProcessCleanText(text) {
  const lines = text
    .replace(/\r\n?/g, '\n') // Normalize Windows/old-Mac line endings first
    .split('\n')
    // Collapse whitespace per line, AFTER splitting: collapsing \s+ on the
    // whole text would also swallow the newlines and flatten every paragraph.
    .map((line) => line.replace(/\s+/g, ' ').trim());

  // Remove duplicate blank lines, keeping a single one between paragraphs.
  return lines.join('\n').replace(/\n{3,}/g, '\n\n').trim();
}

/**
 * Extract the clean text of a page and run the extra post-processing pass
 * over it.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<string>} The post-processed clean text.
 */
async function getSuperCleanText(url) {
  return postProcessCleanText(await extractCleanText(url));
}

Extracting from Files

Clean text extraction also works with file uploads:


Code
 
/**
 * Upload a file to the extraction API and return its extracted text.
 *
 * @param {Blob|File} file - File contents, sent as the `file` form field.
 * @returns {Promise<string>} The `text` field of the API's JSON response.
 * @throws {Error} If the API answers with a non-2xx HTTP status.
 */
async function extractCleanTextFromFile(file) {
  const formData = new FormData();
  formData.append('file', file);

  // No explicit Content-Type header: fetch derives the multipart boundary
  // from the FormData body itself.
  const response = await fetch('https://api.crawler.dev/v1/extract/file', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
    },
    body: formData,
  });

  // fetch() only rejects on network failure; HTTP error statuses must be
  // checked explicitly or an error payload would be parsed as a result.
  if (!response.ok) {
    throw new Error(
      `File text extraction failed: ${response.status} ${response.statusText}`,
    );
  }

  const result = await response.json();
  // File extraction automatically applies clean text processing
  return result.text;
}

Best Practices

Always use cleanText: true for web pages unless you specifically need the raw, unfiltered extracted text
Post-process if needed for specific formatting requirements
Handle empty results - Some pages may not have extractable content
Validate content - Check that extracted text meets your quality standards

Common Patterns

Extract Main Content Only


Code
 
/**
 * Extract a page's clean text and keep only its substantial lines.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<string>} The retained lines joined with newlines. Lines
 *   are kept unmodified; only their trimmed length is used for filtering.
 */
async function extractMainContent(url) {
  const text = await extractCleanText(url);

  // Lines of 20 characters or fewer (after trimming) are likely navigation
  // or metadata, so they are dropped.
  const kept = [];
  for (const line of text.split('\n')) {
    if (line.trim().length > 20) {
      kept.push(line);
    }
  }

  return kept.join('\n');
}

Extract with Word Count Filter


Code
 
/**
 * Extract a page's clean text and reject it when it is too short.
 *
 * @param {string} url - Address of the page to extract.
 * @param {number} [minWords=100] - Minimum number of words required.
 * @returns {Promise<string>} The clean text, unchanged.
 * @throws {Error} If the text contains fewer than `minWords` words.
 */
async function extractWithMinWords(url, minWords = 100) {
  const cleanText = await extractCleanText(url);

  // split() yields empty tokens for empty input and for leading/trailing
  // whitespace; drop them so empty content counts as 0 words, not 1.
  const wordCount = cleanText.split(/\s+/).filter(Boolean).length;

  if (wordCount < minWords) {
    throw new Error(`Content too short: ${wordCount} words (minimum: ${minWords})`);
  }

  return cleanText;
}

Next Steps

Learn about extracting from web pages
Explore batch processing for multiple URLs
Check out the API Reference for complete documentation

Batch Processing Multiple File Formats