Extract clean, readable text content without HTML tags, navigation elements, ads, or formatting noise. Perfect for content analysis, search indexing, and text processing.
Overview
The cleanText option removes unnecessary elements from extracted content, giving you just the meaningful text. This is especially useful when extracting from web pages where you want to ignore navigation, ads, and other non-content elements.
Basic Usage
JavaScript
/**
 * Extract the clean text of a web page through the crawler.dev extraction API.
 *
 * Posts the URL with `cleanText: true` and returns the `text` field of the
 * JSON response.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<string>} The `text` field of the API response.
 * @throws {Error} If the API answers with a non-2xx HTTP status.
 */
async function extractCleanText(url) {
  const response = await fetch('https://api.crawler.dev/v1/extract/url', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      url,
      cleanText: true, // Enable clean text extraction
    }),
  });

  // fetch() only rejects on network failure, so HTTP errors must be checked
  // explicitly; otherwise an error payload would be returned as `undefined`.
  if (!response.ok) {
    throw new Error(`Extraction request failed with status ${response.status}`);
  }

  const result = await response.json();
  return result.text;
}
Python
import requests
def extract_clean_text(url, api_key, timeout=30):
    """Extract the clean text of a web page through the crawler.dev API.

    Args:
        url: Address of the page to extract.
        api_key: Value sent in the ``x-api-key`` request header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``text`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    response = requests.post(
        'https://api.crawler.dev/v1/extract/url',
        headers={
            'x-api-key': api_key,
            'Content-Type': 'application/json',
        },
        json={
            'url': url,
            'cleanText': True,  # Enable clean text extraction
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()['text']
Comparison: Clean vs Raw Text
Clean Text (cleanText: true)
// With cleanText enabled, the following are stripped from the result:
// - HTML tags
// - Navigation menus
// - Advertisements
// - Footer content
// - Sidebar elements
// - Formatting noise
const cleanText = await extractCleanText ( 'https://example.com/article' );
// Result: only the article content, clean and readable
Raw Text (cleanText: false)
/**
 * Extract the raw (non-cleaned) text of a web page through the crawler.dev
 * extraction API.
 *
 * Posts the URL with `cleanText: false` and returns the `text` field of the
 * JSON response.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<string>} The `text` field of the API response.
 * @throws {Error} If the API answers with a non-2xx HTTP status.
 */
async function extractRawText(url) {
  const response = await fetch('https://api.crawler.dev/v1/extract/url', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      url,
      cleanText: false, // Get raw extracted text
    }),
  });

  // fetch() only rejects on network failure, so HTTP errors must be checked
  // explicitly before the body is parsed.
  if (!response.ok) {
    throw new Error(`Extraction request failed with status ${response.status}`);
  }

  const result = await response.json();
  return result.text;
}
// Raw text may include:
// - More formatting artifacts
// - Navigation elements
// - Additional metadata
Use Cases
Content Analysis
/**
 * Compute simple readability statistics for the clean text of a page.
 *
 * @param {string} url - Address of the page to analyze.
 * @returns {Promise<{wordCount: number, sentenceCount: number,
 *   paragraphCount: number, averageWordsPerSentence: number,
 *   readingTime: number}>} Counts derived from the extracted text;
 *   `readingTime` is in minutes.
 */
async function analyzeContent(url) {
  const cleanText = await extractCleanText(url);

  // Splitting produces empty fragments for empty input and for leading or
  // trailing separators; drop them so they are not counted.
  const hasContent = (fragment) => fragment.trim().length > 0;
  const words = cleanText.split(/\s+/).filter(hasContent);
  const sentences = cleanText.split(/[.!?]+/).filter(hasContent);
  const paragraphs = cleanText.split(/\n\n+/).filter(hasContent);

  return {
    wordCount: words.length,
    sentenceCount: sentences.length,
    paragraphCount: paragraphs.length,
    // Guard the division so text without sentences yields 0 instead of NaN
    averageWordsPerSentence:
      sentences.length === 0 ? 0 : words.length / sentences.length,
    readingTime: Math.ceil(words.length / 200), // Assuming 200 words per minute
  };
}
Search Indexing
/**
 * Build a minimal search-index record for a page: its clean text plus the
 * ten most frequent keywords.
 *
 * Keywords are lower-cased tokens longer than three characters with
 * punctuation removed. Ties keep first-occurrence order.
 *
 * @param {string} url - Address of the page to index.
 * @returns {Promise<{url: string, text: string, keywords: string[],
 *   wordCount: number}>} Index record. `wordCount` counts only the tokens
 *   that passed the length filter, not every word in the text.
 */
async function indexContent(url) {
  const cleanText = await extractCleanText(url);

  // Extract keywords (simple example)
  const words = cleanText
    .toLowerCase()
    .replace(/[^\w\s]/g, '')
    .split(/\s+/)
    .filter((word) => word.length > 3);

  // Count word frequency. A Map is used instead of a plain object because
  // tokens such as "constructor" or "tostring" would otherwise read an
  // inherited prototype member and corrupt the count.
  const wordFreq = new Map();
  for (const word of words) {
    wordFreq.set(word, (wordFreq.get(word) || 0) + 1);
  }

  // Get top keywords
  const topKeywords = [...wordFreq.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10)
    .map(([word]) => word);

  return {
    url,
    text: cleanText,
    keywords: topKeywords,
    wordCount: words.length,
  };
}
Text Summarization Preparation
def prepare_for_summarization(url, api_key):
    """Extract clean text and prepare it for summarization.

    The text is split on runs of ``.``, ``!`` and ``?``. Fragments of 20
    characters or fewer (after stripping) are discarded, and each remaining
    sentence is scored by its word count.

    Args:
        url: Address of the page to extract.
        api_key: API key forwarded to ``extract_clean_text``.

    Returns:
        A dict with ``full_text`` (the extracted text), ``sentences`` (a list
        of ``{'text', 'score'}`` dicts), ``total_sentences`` and
        ``word_count`` (whitespace-separated words in the full text).
    """
    import re

    clean_text = extract_clean_text(url, api_key)

    # Split into sentences; the terminating punctuation is dropped
    sentences = re.split(r'[.!?]+', clean_text)
    sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

    # Calculate sentence scores (simple length-based)
    sentence_scores = [
        {'text': s, 'score': len(s.split())}
        for s in sentences
    ]

    return {
        'full_text': clean_text,
        'sentences': sentence_scores,
        'total_sentences': len(sentences),
        'word_count': len(clean_text.split()),
    }
Post-Processing Clean Text
Sometimes you may want additional cleaning:
/**
 * Apply additional whitespace normalization to already-extracted text.
 *
 * Line structure is preserved: runs of spaces and tabs collapse to a single
 * space, each line is trimmed, and runs of blank lines collapse to one blank
 * line so paragraph breaks survive.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} The normalized text without leading or trailing whitespace.
 */
function postProcessCleanText(text) {
  return (
    text
      // Normalize Windows and old-Mac line endings so only "\n" remains
      .replace(/\r\n?/g, '\n')
      // Remove extra whitespace. Only horizontal whitespace is collapsed:
      // collapsing "\n" too would flatten the text to one line and make the
      // per-line steps below no-ops.
      .replace(/[^\S\n]+/g, ' ')
      // Remove leading/trailing whitespace from each line
      .split('\n')
      .map((line) => line.trim())
      .join('\n')
      // Remove duplicate blank lines
      .replace(/\n{3,}/g, '\n\n')
      .trim()
  );
}
/**
 * Fetch the clean text of a page and run it through the extra
 * post-processing pass.
 *
 * @param {string} url - Address of the page to extract.
 * @returns {Promise<string>} The post-processed clean text.
 */
async function getSuperCleanText(url) {
  return postProcessCleanText(await extractCleanText(url));
}
Clean text extraction also works with file uploads:
/**
 * Extract clean text from an uploaded file through the crawler.dev file
 * extraction endpoint.
 *
 * The file is sent as the `file` field of a multipart form. No `Content-Type`
 * header is set by hand, so `fetch` can supply the multipart boundary itself.
 *
 * @param {Blob} file - File (or Blob) to upload.
 * @returns {Promise<string>} The `text` field of the API response.
 * @throws {Error} If the API answers with a non-2xx HTTP status.
 */
async function extractCleanTextFromFile(file) {
  const formData = new FormData();
  formData.append('file', file);

  const response = await fetch('https://api.crawler.dev/v1/extract/file', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
    },
    body: formData,
  });

  // fetch() only rejects on network failure, so HTTP errors must be checked
  // explicitly before the body is parsed.
  if (!response.ok) {
    throw new Error(`File extraction request failed with status ${response.status}`);
  }

  const result = await response.json();
  // File extraction automatically applies clean text processing
  return result.text;
}
Best Practices
Always use cleanText: true for web pages unless you need the raw extracted text
Post-process if needed for specific formatting requirements
Handle empty results - Some pages may not have extractable content
Validate content - Check that extracted text meets your quality standards
Common Patterns
Extract Main Content Only
/**
 * Extract the clean text of a page and keep only its substantial lines.
 *
 * @param {string} url - Address of the page to extract.
 * @param {number} [minLineLength=20] - Lines whose trimmed length does not
 *   exceed this value are dropped.
 * @returns {Promise<string>} The surviving lines joined with "\n".
 */
async function extractMainContent(url, minLineLength = 20) {
  const cleanText = await extractCleanText(url);

  // Remove very short lines (likely navigation or metadata)
  return cleanText
    .split('\n')
    .filter((line) => line.trim().length > minLineLength)
    .join('\n');
}
/**
 * Extract the clean text of a page and reject it when it is too short.
 *
 * @param {string} url - Address of the page to extract.
 * @param {number} [minWords=100] - Minimum number of whitespace-separated
 *   words the text must contain.
 * @returns {Promise<string>} The extracted text, unchanged.
 * @throws {Error} If the text contains fewer than `minWords` words.
 */
async function extractWithMinWords(url, minWords = 100) {
  const cleanText = await extractCleanText(url);

  // Trim first and special-case the empty string: ''.split(/\s+/) yields
  // [''], which would count empty content as one word.
  const trimmed = cleanText.trim();
  const wordCount = trimmed === '' ? 0 : trimmed.split(/\s+/).length;

  if (wordCount < minWords) {
    throw new Error(`Content too short: ${wordCount} words (minimum: ${minWords})`);
  }
  return cleanText;
}
Next Steps