Examples

Extract Markdown from HTML

Convert HTML web pages to clean markdown format. This is useful for content migration, documentation generation, and creating readable text from web content.

Overview

The crawler.dev API can extract content from web pages and convert it to markdown format, preserving structure like headings, lists, links, and code blocks while removing unnecessary HTML elements.

cURL Example

Code
 
# POST the target URL to the extract endpoint, requesting both the plain-text
# and the markdown rendering in the response.
curl --request POST https://api.crawler.dev/v1/extract/url \
  --header "x-api-key: YOUR_API_KEY" \
  --header "Content-Type: application/json" \
  --data '{
    "url": "https://example.com/article",
    "formats": ["text", "markdown"]
  }'

Response:


Code
 
{
  "url": "https://example.com/article",
  "finalUrl": "https://example.com/article",
  "statusCode": 200,
  "contentType": "text/html",
  "size": 2048,
  "text": "Article Title\n\nThis is the article content...",
  "markdown": "# Article Title\n\nThis is the article content in markdown format...\n\n## Section\n\n- List item 1\n- List item 2\n\n[Link text](https://example.com)"
}

JavaScript Example

Basic Markdown Extraction


Code
 
/**
 * Send a page URL to the crawler.dev extract endpoint and return the
 * markdown rendering from the response.
 *
 * @param {string} url - Address of the web page to convert.
 * @returns {Promise<string>} The `markdown` field of the API response.
 * @throws {Error} When the API answers with a non-OK status. The message is
 *   taken from `error.message` in the JSON error body when available;
 *   otherwise a generic message that includes the HTTP status is used.
 */
async function extractMarkdownFromUrl(url) {
  const response = await fetch('https://api.crawler.dev/v1/extract/url', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      url,
      formats: ['text', 'markdown'],
      cleanText: true,
    }),
  });

  if (!response.ok) {
    // An error response may not carry a JSON body; a parse failure here must
    // not replace the HTTP failure with an unrelated SyntaxError.
    let apiMessage;
    try {
      const errorBody = await response.json();
      apiMessage = errorBody?.error?.message;
    } catch {
      apiMessage = undefined;
    }
    throw new Error(
      apiMessage || `Failed to extract markdown (HTTP ${response.status})`
    );
  }

  const result = await response.json();
  return result.markdown;
}

// Usage: print the markdown for one article, or the error message on failure.
(async () => {
  try {
    const markdown = await extractMarkdownFromUrl('https://example.com/article');
    console.log('Markdown content:');
    console.log(markdown);
  } catch (error) {
    console.error('Error:', error.message);
  }
})();

Save Markdown to File


Code
 
const fs = require('fs');
const path = require('path');

/**
 * Convert a page to markdown and persist the result on disk.
 *
 * The parent directory of `outputPath` is created first when it does not
 * exist yet. Any failure is logged to stderr and then rethrown to the caller.
 *
 * @param {string} url - Page to convert.
 * @param {string} outputPath - Destination file for the markdown.
 * @returns {Promise<string>} The markdown that was written.
 */
async function extractAndSaveMarkdown(url, outputPath) {
  let markdown;

  try {
    markdown = await extractMarkdownFromUrl(url);

    // Create the destination folder (including missing ancestors) on demand.
    const targetDir = path.dirname(outputPath);
    const dirIsMissing = !fs.existsSync(targetDir);
    if (dirIsMissing) {
      fs.mkdirSync(targetDir, { recursive: true });
    }

    fs.writeFileSync(outputPath, markdown, 'utf8');
    console.log(`Markdown saved to ${outputPath}`);
  } catch (error) {
    console.error('Error extracting markdown:', error.message);
    throw error;
  }

  return markdown;
}

// Usage
// extractAndSaveMarkdown logs the failure and then rethrows it, so the
// returned promise still has to be handled here; otherwise a failed run ends
// in an unhandled promise rejection.
extractAndSaveMarkdown('https://example.com/article', './output/article.md')
  .catch(() => {
    process.exitCode = 1;
  });

Python Example

Basic Markdown Extraction


Code
 
import requests
import os

def extract_markdown_from_url(url, timeout=30):
    """
    Extract markdown from a webpage using the crawler.dev API.

    The API key is read from the CRAWLER_API_KEY environment variable.

    Args:
        url: URL of the webpage to convert to markdown
        timeout: Seconds to wait for the API before giving up. Passed straight
            to ``requests.post``; without it a stalled connection would block
            the caller indefinitely.

    Returns:
        str: Markdown content (the ``markdown`` field of the JSON response)

    Raises:
        ValueError: If CRAWLER_API_KEY is not set.
        requests.exceptions.RequestException: If the request fails, times out,
            or the API returns an error status (via ``raise_for_status``).
        KeyError: If the response JSON has no ``markdown`` field.
    """
    api_key = os.getenv('CRAWLER_API_KEY')

    if not api_key:
        raise ValueError("CRAWLER_API_KEY environment variable not set")

    endpoint = "https://api.crawler.dev/v1/extract/url"
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json"
    }

    payload = {
        "url": url,
        "formats": ["text", "markdown"],
        "cleanText": True
    }

    response = requests.post(
        endpoint, headers=headers, json=payload, timeout=timeout
    )
    response.raise_for_status()

    result = response.json()
    return result['markdown']

# Usage
try:
    markdown = extract_markdown_from_url('https://example.com/article')
    print(markdown)
except ValueError as e:
    # Raised by extract_markdown_from_url when CRAWLER_API_KEY is not set.
    print(f"Configuration error: {e}")
except requests.exceptions.RequestException as e:
    print(f"Error: {e}")

Save Markdown to File


Code
 
import os
from pathlib import Path

def extract_and_save_markdown(url, output_path):
    """
    Fetch a page as markdown and write it to a file.

    Missing parent directories of ``output_path`` are created before writing.

    Args:
        url: URL to extract markdown from
        output_path: Path to save markdown file

    Returns:
        The markdown text that was written.
    """
    content = extract_markdown_from_url(url)

    # Create the destination folder (and any missing ancestors) up front.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(content)

    print(f"Markdown saved to {output_path}")
    return content

# Usage: convert the example article and write its markdown to
# ./output/article.md.
extract_and_save_markdown('https://example.com/article', './output/article.md')

Converting HTML Files to Markdown

If you have an HTML file, you can upload it and extract markdown:


Code
 
/**
 * Upload an HTML file to the crawler.dev file-extract endpoint and return the
 * markdown rendering from the response.
 *
 * @param {Blob|File} file - The HTML file to upload (appended to the form as
 *   the `file` field).
 * @returns {Promise<string>} The `markdown` field of the API response.
 * @throws {Error} When the API answers with a non-OK status. The message is
 *   taken from `error.message` in the JSON error body when available;
 *   otherwise a generic message that includes the HTTP status is used.
 */
async function htmlFileToMarkdown(file) {
  const formData = new FormData();
  formData.append('file', file);
  // For multipart/form-data, send formats as separate entries
  formData.append('formats', 'text');
  formData.append('formats', 'markdown');

  // No Content-Type header is set here; only the API key is sent explicitly.
  const response = await fetch('https://api.crawler.dev/v1/extract/file', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
    },
    body: formData,
  });

  if (!response.ok) {
    // An error response may not carry a JSON body; a parse failure here must
    // not replace the HTTP failure with an unrelated SyntaxError.
    let apiMessage;
    try {
      const errorBody = await response.json();
      apiMessage = errorBody?.error?.message;
    } catch {
      apiMessage = undefined;
    }
    throw new Error(
      apiMessage || `Failed to extract markdown (HTTP ${response.status})`
    );
  }

  const result = await response.json();
  // The response carries both text and markdown fields; only markdown is returned.
  return result.markdown;
}

Use Cases

Documentation Migration


Code
 
def migrate_docs_to_markdown(urls, output_dir):
    """
    Migrate multiple documentation pages to markdown format.

    Each URL is converted independently: a failure is reported on stdout and
    the remaining URLs are still processed. The filename is derived from the
    URL path (slashes become underscores, an empty path becomes ``index``).
    When two URLs in the same call map to the same filename, later ones get a
    numeric suffix (``name-2.md``, ``name-3.md``, ...) instead of silently
    overwriting the earlier file.

    Args:
        urls: List of documentation URLs
        output_dir: Directory to save markdown files
    """
    import os
    from urllib.parse import urlparse

    os.makedirs(output_dir, exist_ok=True)

    # Filenames already written during this call, used to avoid collisions.
    used_names = set()

    for url in urls:
        try:
            markdown = extract_markdown_from_url(url)

            # Create filename from URL
            stem = urlparse(url).path.strip('/').replace('/', '_') or 'index'
            filename = f"{stem}.md"
            suffix = 2
            while filename in used_names:
                filename = f"{stem}-{suffix}.md"
                suffix += 1
            used_names.add(filename)

            output_path = os.path.join(output_dir, filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown)

            print(f"✓ Converted {url} -> {output_path}")
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next URL.
            print(f"✗ Failed to convert {url}: {e}")

# Usage: convert three documentation pages; each one is written into
# ./markdown-docs under a filename derived from its URL path.
docs_urls = [
    'https://example.com/docs/getting-started',
    'https://example.com/docs/api-reference',
    'https://example.com/docs/examples'
]
migrate_docs_to_markdown(docs_urls, './markdown-docs')

Content Analysis


Code
 
/**
 * Fetch a page as markdown and report simple structural statistics.
 *
 * Headings, links and list items are counted outside fenced code blocks, so
 * lines such as `# comment` or `- flag` inside a code sample are not mistaken
 * for markdown structure. `totalLength` and `wordCount` cover the whole text.
 *
 * @param {string} url - Page to analyze.
 * @returns {Promise<{totalLength: number, headings: number, links: number,
 *   codeBlocks: number, lists: number, wordCount: number}>} Counts of
 *   ATX headings (1-6 `#`), inline links, fenced code blocks, unordered list
 *   items, and whitespace-separated words.
 */
async function analyzeMarkdownContent(url) {
  const markdown = await extractMarkdownFromUrl(url);

  const countMatches = (text, pattern) => (text.match(pattern) || []).length;

  const codeBlocks = countMatches(markdown, /```[\s\S]*?```/g);
  // Structural markers are only meaningful outside fenced code.
  const prose = markdown.replace(/```[\s\S]*?```/g, '');

  return {
    totalLength: markdown.length,
    headings: countMatches(prose, /^#{1,6}\s/gm),
    links: countMatches(prose, /\[([^\]]+)\]\(([^)]+)\)/g),
    codeBlocks,
    lists: countMatches(prose, /^[-*+]\s/gm),
    // Matching non-whitespace runs avoids counting the empty tokens that
    // split(/\s+/) yields for empty input or leading/trailing whitespace.
    wordCount: countMatches(markdown, /\S+/g),
  };
}

Markdown Format Features

The extracted markdown preserves:

Headings: # H1, ## H2, etc.
Links: [text](url)
Lists: Ordered and unordered
Code blocks: Fenced code blocks with syntax highlighting
Bold/Italic: **bold** and *italic*
Blockquotes: > quote

Tips

Use cleanText: true for better markdown quality
Handle errors - Some pages may not convert perfectly
Post-process if needed - You may want to clean up the markdown further
Batch processing - Use batch endpoints for multiple URLs

Next Steps

Learn about batch processing for multiple conversions
Explore clean text extraction options
Check out the API Reference for complete documentation

Previous: Extract Text from Webpage | Next: Batch Processing

Examples

Extract Markdown from HTML

Convert HTML web pages to clean markdown format. This is useful for content migration, documentation generation, and creating readable text from web content.

Overview

The crawler.dev API can extract content from web pages and convert it to markdown format, preserving structure like headings, lists, links, and code blocks while removing unnecessary HTML elements.

cURL Example

Code
 
# POST the target URL to the extract endpoint, requesting both the plain-text
# and the markdown rendering in the response.
curl --request POST https://api.crawler.dev/v1/extract/url \
  --header "x-api-key: YOUR_API_KEY" \
  --header "Content-Type: application/json" \
  --data '{
    "url": "https://example.com/article",
    "formats": ["text", "markdown"]
  }'

Response:


Code
 
{
  "url": "https://example.com/article",
  "finalUrl": "https://example.com/article",
  "statusCode": 200,
  "contentType": "text/html",
  "size": 2048,
  "text": "Article Title\n\nThis is the article content...",
  "markdown": "# Article Title\n\nThis is the article content in markdown format...\n\n## Section\n\n- List item 1\n- List item 2\n\n[Link text](https://example.com)"
}

JavaScript Example

Basic Markdown Extraction


Code
 
/**
 * Send a page URL to the crawler.dev extract endpoint and return the
 * markdown rendering from the response.
 *
 * @param {string} url - Address of the web page to convert.
 * @returns {Promise<string>} The `markdown` field of the API response.
 * @throws {Error} When the API answers with a non-OK status. The message is
 *   taken from `error.message` in the JSON error body when available;
 *   otherwise a generic message that includes the HTTP status is used.
 */
async function extractMarkdownFromUrl(url) {
  const response = await fetch('https://api.crawler.dev/v1/extract/url', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      url,
      formats: ['text', 'markdown'],
      cleanText: true,
    }),
  });

  if (!response.ok) {
    // An error response may not carry a JSON body; a parse failure here must
    // not replace the HTTP failure with an unrelated SyntaxError.
    let apiMessage;
    try {
      const errorBody = await response.json();
      apiMessage = errorBody?.error?.message;
    } catch {
      apiMessage = undefined;
    }
    throw new Error(
      apiMessage || `Failed to extract markdown (HTTP ${response.status})`
    );
  }

  const result = await response.json();
  return result.markdown;
}

// Usage: print the markdown for one article, or the error message on failure.
(async () => {
  try {
    const markdown = await extractMarkdownFromUrl('https://example.com/article');
    console.log('Markdown content:');
    console.log(markdown);
  } catch (error) {
    console.error('Error:', error.message);
  }
})();

Save Markdown to File


Code
 
const fs = require('fs');
const path = require('path');

/**
 * Convert a page to markdown and persist the result on disk.
 *
 * The parent directory of `outputPath` is created first when it does not
 * exist yet. Any failure is logged to stderr and then rethrown to the caller.
 *
 * @param {string} url - Page to convert.
 * @param {string} outputPath - Destination file for the markdown.
 * @returns {Promise<string>} The markdown that was written.
 */
async function extractAndSaveMarkdown(url, outputPath) {
  let markdown;

  try {
    markdown = await extractMarkdownFromUrl(url);

    // Create the destination folder (including missing ancestors) on demand.
    const targetDir = path.dirname(outputPath);
    const dirIsMissing = !fs.existsSync(targetDir);
    if (dirIsMissing) {
      fs.mkdirSync(targetDir, { recursive: true });
    }

    fs.writeFileSync(outputPath, markdown, 'utf8');
    console.log(`Markdown saved to ${outputPath}`);
  } catch (error) {
    console.error('Error extracting markdown:', error.message);
    throw error;
  }

  return markdown;
}

// Usage
// extractAndSaveMarkdown logs the failure and then rethrows it, so the
// returned promise still has to be handled here; otherwise a failed run ends
// in an unhandled promise rejection.
extractAndSaveMarkdown('https://example.com/article', './output/article.md')
  .catch(() => {
    process.exitCode = 1;
  });

Python Example

Basic Markdown Extraction


Code
 
import requests
import os

def extract_markdown_from_url(url, timeout=30):
    """
    Extract markdown from a webpage using the crawler.dev API.

    The API key is read from the CRAWLER_API_KEY environment variable.

    Args:
        url: URL of the webpage to convert to markdown
        timeout: Seconds to wait for the API before giving up. Passed straight
            to ``requests.post``; without it a stalled connection would block
            the caller indefinitely.

    Returns:
        str: Markdown content (the ``markdown`` field of the JSON response)

    Raises:
        ValueError: If CRAWLER_API_KEY is not set.
        requests.exceptions.RequestException: If the request fails, times out,
            or the API returns an error status (via ``raise_for_status``).
        KeyError: If the response JSON has no ``markdown`` field.
    """
    api_key = os.getenv('CRAWLER_API_KEY')

    if not api_key:
        raise ValueError("CRAWLER_API_KEY environment variable not set")

    endpoint = "https://api.crawler.dev/v1/extract/url"
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json"
    }

    payload = {
        "url": url,
        "formats": ["text", "markdown"],
        "cleanText": True
    }

    response = requests.post(
        endpoint, headers=headers, json=payload, timeout=timeout
    )
    response.raise_for_status()

    result = response.json()
    return result['markdown']

# Usage
try:
    markdown = extract_markdown_from_url('https://example.com/article')
    print(markdown)
except ValueError as e:
    # Raised by extract_markdown_from_url when CRAWLER_API_KEY is not set.
    print(f"Configuration error: {e}")
except requests.exceptions.RequestException as e:
    print(f"Error: {e}")

Save Markdown to File


Code
 
import os
from pathlib import Path

def extract_and_save_markdown(url, output_path):
    """
    Fetch a page as markdown and write it to a file.

    Missing parent directories of ``output_path`` are created before writing.

    Args:
        url: URL to extract markdown from
        output_path: Path to save markdown file

    Returns:
        The markdown text that was written.
    """
    content = extract_markdown_from_url(url)

    # Create the destination folder (and any missing ancestors) up front.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(content)

    print(f"Markdown saved to {output_path}")
    return content

# Usage: convert the example article and write its markdown to
# ./output/article.md.
extract_and_save_markdown('https://example.com/article', './output/article.md')

Converting HTML Files to Markdown

If you have an HTML file, you can upload it and extract markdown:


Code
 
/**
 * Upload an HTML file to the crawler.dev file-extract endpoint and return the
 * markdown rendering from the response.
 *
 * @param {Blob|File} file - The HTML file to upload (appended to the form as
 *   the `file` field).
 * @returns {Promise<string>} The `markdown` field of the API response.
 * @throws {Error} When the API answers with a non-OK status. The message is
 *   taken from `error.message` in the JSON error body when available;
 *   otherwise a generic message that includes the HTTP status is used.
 */
async function htmlFileToMarkdown(file) {
  const formData = new FormData();
  formData.append('file', file);
  // For multipart/form-data, send formats as separate entries
  formData.append('formats', 'text');
  formData.append('formats', 'markdown');

  // No Content-Type header is set here; only the API key is sent explicitly.
  const response = await fetch('https://api.crawler.dev/v1/extract/file', {
    method: 'POST',
    headers: {
      'x-api-key': 'YOUR_API_KEY',
    },
    body: formData,
  });

  if (!response.ok) {
    // An error response may not carry a JSON body; a parse failure here must
    // not replace the HTTP failure with an unrelated SyntaxError.
    let apiMessage;
    try {
      const errorBody = await response.json();
      apiMessage = errorBody?.error?.message;
    } catch {
      apiMessage = undefined;
    }
    throw new Error(
      apiMessage || `Failed to extract markdown (HTTP ${response.status})`
    );
  }

  const result = await response.json();
  // The response carries both text and markdown fields; only markdown is returned.
  return result.markdown;
}

Use Cases

Documentation Migration


Code
 
def migrate_docs_to_markdown(urls, output_dir):
    """
    Migrate multiple documentation pages to markdown format.

    Each URL is converted independently: a failure is reported on stdout and
    the remaining URLs are still processed. The filename is derived from the
    URL path (slashes become underscores, an empty path becomes ``index``).
    When two URLs in the same call map to the same filename, later ones get a
    numeric suffix (``name-2.md``, ``name-3.md``, ...) instead of silently
    overwriting the earlier file.

    Args:
        urls: List of documentation URLs
        output_dir: Directory to save markdown files
    """
    import os
    from urllib.parse import urlparse

    os.makedirs(output_dir, exist_ok=True)

    # Filenames already written during this call, used to avoid collisions.
    used_names = set()

    for url in urls:
        try:
            markdown = extract_markdown_from_url(url)

            # Create filename from URL
            stem = urlparse(url).path.strip('/').replace('/', '_') or 'index'
            filename = f"{stem}.md"
            suffix = 2
            while filename in used_names:
                filename = f"{stem}-{suffix}.md"
                suffix += 1
            used_names.add(filename)

            output_path = os.path.join(output_dir, filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown)

            print(f"✓ Converted {url} -> {output_path}")
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next URL.
            print(f"✗ Failed to convert {url}: {e}")

# Usage: convert three documentation pages; each one is written into
# ./markdown-docs under a filename derived from its URL path.
docs_urls = [
    'https://example.com/docs/getting-started',
    'https://example.com/docs/api-reference',
    'https://example.com/docs/examples'
]
migrate_docs_to_markdown(docs_urls, './markdown-docs')

Content Analysis


Code
 
/**
 * Fetch a page as markdown and report simple structural statistics.
 *
 * Headings, links and list items are counted outside fenced code blocks, so
 * lines such as `# comment` or `- flag` inside a code sample are not mistaken
 * for markdown structure. `totalLength` and `wordCount` cover the whole text.
 *
 * @param {string} url - Page to analyze.
 * @returns {Promise<{totalLength: number, headings: number, links: number,
 *   codeBlocks: number, lists: number, wordCount: number}>} Counts of
 *   ATX headings (1-6 `#`), inline links, fenced code blocks, unordered list
 *   items, and whitespace-separated words.
 */
async function analyzeMarkdownContent(url) {
  const markdown = await extractMarkdownFromUrl(url);

  const countMatches = (text, pattern) => (text.match(pattern) || []).length;

  const codeBlocks = countMatches(markdown, /```[\s\S]*?```/g);
  // Structural markers are only meaningful outside fenced code.
  const prose = markdown.replace(/```[\s\S]*?```/g, '');

  return {
    totalLength: markdown.length,
    headings: countMatches(prose, /^#{1,6}\s/gm),
    links: countMatches(prose, /\[([^\]]+)\]\(([^)]+)\)/g),
    codeBlocks,
    lists: countMatches(prose, /^[-*+]\s/gm),
    // Matching non-whitespace runs avoids counting the empty tokens that
    // split(/\s+/) yields for empty input or leading/trailing whitespace.
    wordCount: countMatches(markdown, /\S+/g),
  };
}

Markdown Format Features

The extracted markdown preserves:

Headings: # H1, ## H2, etc.
Links: [text](url)
Lists: Ordered and unordered
Code blocks: Fenced code blocks with syntax highlighting
Bold/Italic: **bold** and *italic*
Blockquotes: > quote

Tips

Use cleanText: true for better markdown quality
Handle errors - Some pages may not convert perfectly
Post-process if needed - You may want to clean up the markdown further
Batch processing - Use batch endpoints for multiple URLs

Next Steps

Learn about batch processing for multiple conversions
Explore clean text extraction options
Check out the API Reference for complete documentation

Previous: Extract Text from Webpage | Next: Batch Processing