Convert HTML web pages to clean markdown format. This is useful for content migration, documentation generation, and creating readable text from web content.
Overview
The crawler.dev API can extract content from web pages and convert it to markdown format, preserving structure like headings, lists, links, and code blocks while removing unnecessary HTML elements.
cURL Example
# POST the page URL to the extract endpoint and ask for both the text and markdown formats.
curl --request POST https://api.crawler.dev/v1/extract/url \
  --header "x-api-key: YOUR_API_KEY" \
  --header "Content-Type: application/json" \
  --data '{
"url": "https://example.com/article",
"formats": ["text", "markdown"]
}'
Response:
{
"url" : "https://example.com/article" ,
"finalUrl" : "https://example.com/article" ,
"statusCode" : 200 ,
"contentType" : "text/html" ,
"size" : 2048 ,
"text" : "Article Title \n\n This is the article content..." ,
"markdown" : "# Article Title \n\n This is the article content in markdown format... \n\n ## Section \n\n - List item 1 \n - List item 2 \n\n [Link text](https://example.com)"
}
JavaScript Example
/**
 * Convert a web page to markdown through the crawler.dev extract endpoint.
 *
 * @param {string} url - Address of the page to convert.
 * @returns {Promise<string>} The `markdown` field of the API response.
 * @throws {Error} When the response is not OK. The message is the API's
 *   `error.message` when the body carries one, otherwise a generic fallback.
 */
async function extractMarkdownFromUrl(url) {
  const response = await fetch('https://api.crawler.dev/v1/extract/url', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      url,
      formats: ['text', 'markdown'],
      cleanText: true,
    }),
  });

  if (!response.ok) {
    // Error bodies are not guaranteed to be JSON (a proxy may answer with an
    // HTML page); a parse failure must not replace the extraction error.
    let apiMessage;
    try {
      const payload = await response.json();
      apiMessage = payload?.error?.message;
    } catch {
      apiMessage = undefined;
    }
    throw new Error(apiMessage || 'Failed to extract markdown');
  }

  const result = await response.json();
  return result.markdown;
}
// Usage: convert one article, print its markdown, and report any failure.
const printMarkdown = (markdown) => {
  console.log('Markdown content:');
  console.log(markdown);
};
const reportFailure = (error) => {
  console.error('Error:', error.message);
};

extractMarkdownFromUrl('https://example.com/article')
  .then(printMarkdown)
  .catch(reportFailure);
Save Markdown to File
const fs = require ( 'fs' );
const path = require ( 'path' );
/**
 * Convert a web page to markdown and write the result to disk.
 *
 * Missing parent directories of `outputPath` are created.
 *
 * @param {string} url - Address of the page to convert.
 * @param {string} outputPath - Destination file for the markdown.
 * @returns {Promise<string>} The markdown that was written.
 * @throws Rethrows any extraction or filesystem error after logging it.
 */
async function extractAndSaveMarkdown(url, outputPath) {
  try {
    const markdown = await extractMarkdownFromUrl(url);

    // `recursive: true` already succeeds when the directory exists, so no
    // separate existence check is needed (and none can go stale).
    await fs.promises.mkdir(path.dirname(outputPath), { recursive: true });

    // Promise-based fs calls avoid blocking the event loop in this async function.
    await fs.promises.writeFile(outputPath, markdown, 'utf8');

    console.log(`Markdown saved to ${outputPath}`);
    return markdown;
  } catch (error) {
    console.error('Error extracting markdown:', error.message);
    throw error;
  }
}
// Usage
// extractAndSaveMarkdown logs and then rethrows, so the returned promise still
// needs a rejection handler; otherwise Node.js reports an unhandled rejection.
extractAndSaveMarkdown('https://example.com/article', './output/article.md').catch(() => {
  process.exitCode = 1;
});
Python Example
import requests
import os
def extract_markdown_from_url(url, timeout=30):
    """
    Extract markdown from a webpage using the crawler.dev API.

    Args:
        url: URL of the webpage to convert to markdown
        timeout: Seconds to wait for the API before giving up (default 30)

    Returns:
        str: Markdown content

    Raises:
        ValueError: If the CRAWLER_API_KEY environment variable is not set
        requests.exceptions.RequestException: On a network failure, a timeout,
            or an HTTP error status
    """
    api_key = os.getenv('CRAWLER_API_KEY')
    if not api_key:
        raise ValueError("CRAWLER_API_KEY environment variable not set")

    endpoint = "https://api.crawler.dev/v1/extract/url"
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json",
    }
    payload = {
        "url": url,
        "formats": ["text", "markdown"],
        "cleanText": True,
    }

    # requests applies no timeout by default; without one a stalled
    # connection would block the caller indefinitely.
    response = requests.post(endpoint, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()

    result = response.json()
    return result['markdown']
# Usage
try:
    markdown = extract_markdown_from_url('https://example.com/article')
    print(markdown)
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers the missing CRAWLER_API_KEY case raised by the function.
    print(f"Error: {e}")
Save Markdown to File
import os
from pathlib import Path
def extract_and_save_markdown(url, output_path):
    """
    Extract markdown from URL and save to file.

    Args:
        url: URL to extract markdown from
        output_path: Path to save markdown file

    Returns:
        str: The markdown content that was written
    """
    markdown = extract_markdown_from_url(url)

    # Ensure directory exists
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Write markdown to file
    target.write_text(markdown, encoding='utf-8')

    print(f"Markdown saved to {output_path}")
    return markdown
# Usage: convert one article and store it under ./output
extract_and_save_markdown(
    url='https://example.com/article',
    output_path='./output/article.md',
)
Converting HTML Files to Markdown
If you have an HTML file, you can upload it and extract markdown:
/**
 * Upload an HTML file to the crawler.dev API and get it back as markdown.
 *
 * @param {Blob|File} file - HTML document to upload.
 * @returns {Promise<string>} The `markdown` field of the API response.
 * @throws {Error} When the response is not OK. The message is the API's
 *   `error.message` when the body carries one, otherwise a generic fallback.
 */
async function htmlFileToMarkdown(file) {
  const formData = new FormData();
  formData.append('file', file);
  // For multipart/form-data, send formats as separate entries
  formData.append('formats', 'text');
  formData.append('formats', 'markdown');

  // No Content-Type header here: fetch derives the multipart boundary from
  // the FormData body.
  const response = await fetch('https://api.crawler.dev/v1/extract/file', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
    },
    body: formData,
  });

  if (!response.ok) {
    // Error bodies are not guaranteed to be JSON (a proxy may answer with an
    // HTML page); a parse failure must not replace the extraction error.
    let apiMessage;
    try {
      const payload = await response.json();
      apiMessage = payload?.error?.message;
    } catch {
      apiMessage = undefined;
    }
    throw new Error(apiMessage || 'Failed to extract markdown');
  }

  const result = await response.json();
  // Returns both text and markdown fields
  return result.markdown;
}
Use Cases
Documentation Migration
def migrate_docs_to_markdown(urls, output_dir):
    """
    Migrate multiple documentation pages to markdown format.

    Each page is saved as <output_dir>/<url path with "/" replaced by "_">.md;
    a URL with an empty path is saved as index.md. A failure on one URL is
    reported and does not stop the remaining conversions.

    Args:
        urls: List of documentation URLs
        output_dir: Directory to save markdown files
    """
    import os
    from urllib.parse import urlparse

    os.makedirs(output_dir, exist_ok=True)

    for url in urls:
        try:
            markdown = extract_markdown_from_url(url)

            # Create filename from URL
            slug = urlparse(url).path.strip('/').replace('/', '_')
            # A bare domain or "/" leaves an empty slug; fall back to index.md.
            filename = f"{slug}.md" if slug else 'index.md'

            output_path = os.path.join(output_dir, filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown)

            print(f"✓ Converted {url} -> {output_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next URL.
            print(f"✗ Failed to convert {url}: {e}")
# Usage: convert three documentation pages into ./markdown-docs
docs_urls = [
    'https://example.com/docs/getting-started',
    'https://example.com/docs/api-reference',
    'https://example.com/docs/examples',
]
migrate_docs_to_markdown(urls=docs_urls, output_dir='./markdown-docs')
Content Analysis
/**
 * Convert a page to markdown and report simple structural statistics.
 *
 * @param {string} url - Address of the page to analyze.
 * @returns {Promise<{totalLength: number, headings: number, links: number,
 *   codeBlocks: number, lists: number, wordCount: number}>} Counts taken from
 *   the markdown text. `lists` counts lines that start with an unordered list
 *   marker (`-`, `*` or `+`); `wordCount` counts whitespace-separated tokens.
 */
async function analyzeMarkdownContent(url) {
  const markdown = await extractMarkdownFromUrl(url);

  // `match` returns null when nothing matches, hence the empty-array fallback.
  const countMatches = (pattern) => (markdown.match(pattern) || []).length;

  return {
    totalLength: markdown.length,
    headings: countMatches(/^#+\s/gm),
    links: countMatches(/\[([^\]]+)\]\(([^)]+)\)/g),
    codeBlocks: countMatches(/```[\s\S]*?```/g),
    lists: countMatches(/^[-*+]\s/gm),
    // Counting non-whitespace runs (rather than splitting on whitespace)
    // yields 0 for empty input and ignores leading/trailing whitespace.
    wordCount: countMatches(/\S+/g),
  };
}
Markdown Format Features
The extracted markdown preserves:
Headings: # H1, ## H2, etc.
Links: [text](url)
Lists: Ordered and unordered
Code blocks: Fenced code blocks with syntax highlighting
Bold/Italic: **bold** and *italic*
Blockquotes: > quote
Tips
Use cleanText: true for better markdown quality
Handle errors - Some pages may not convert perfectly
Post-process if needed - You may want to clean up the markdown further
Batch processing - Use batch endpoints for multiple URLs
Next Steps