SDKs

PHP SDK

Text Extraction with PHP

Learn how to extract text from any kind of file or URL with the crawler.dev PHP SDK.

Prerequisites

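To get the most out of this guide, you'll need a crawler.dev API key (the examples read it from the `CRAWLER_DEV_API_KEY` environment variable), PHP 8.0 or later (the examples use named arguments), and Composer.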

Installation

Install the crawler.dev PHP SDK using Composer:

Code
 
composer require crawler-dev/api-sdk-php

Quick Start

Here's how to get started with text extraction using PHP:


Code
 
<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;

// Read the API key from the environment; when the variable is unset (or its
// value is falsy) the placeholder string is used instead.
$apiKey = getenv("CRAWLER_DEV_API_KEY") ?: "My API Key";
$client = new Client(apiKey: $apiKey);

// File extraction: dump the content type found on the result.
$fileResult = $client->files->extractText(file: 'file');
var_dump($fileResult->contentType);

// URL extraction: print the extracted text.
$urlResult = $client->urls->extractText(url: 'https://example.com');
echo $urlResult->text;

Features

Modern PHP SDK with PSR compliance
Comprehensive documentation
Full type hints support
Automatic retry logic
Exception-based error handling
Guzzle HTTP client integration

Repository

GitHub: https://github.com/crawler-dot-dev/api-sdk-php

Examples

Extract Text from a File


Code
 
<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;

// The API key is taken from the environment.
$apiKey = getenv("CRAWLER_DEV_API_KEY");
$client = new Client(apiKey: $apiKey);

// Run text extraction on a single file.
// The file parameter can be a file path, file resource, or file contents
$extraction = $client->files->extractText(file: 'document.pdf');

// Print the extracted text, then the content type on a new line.
echo $extraction->text;
echo "\nContent type: {$extraction->contentType}";

Extract Text from Multiple URLs


Code
 
<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;

// The API key is taken from the environment.
$apiKey = getenv("CRAWLER_DEV_API_KEY");
$client = new Client(apiKey: $apiKey);

$pageUrls = [
    'https://example.com/page1',
    'https://example.com/page2',
    'https://example.com/page3',
];

// Each URL is processed independently: a failure on one page is reported
// and the loop moves on to the next URL.
foreach ($pageUrls as $pageUrl) {
    try {
        $extraction = $client->urls->extractText(url: $pageUrl);
        $line = "Text from {$pageUrl}: " . $extraction->text . "\n";
        echo $line;
    } catch (Exception $error) {
        $line = "Error extracting from {$pageUrl}: " . $error->getMessage() . "\n";
        echo $line;
    }
}

Using Request Options


Code
 
<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;
use CrawlerDev\RequestOptions;

// The API key is taken from the environment.
$apiKey = getenv("CRAWLER_DEV_API_KEY");
$client = new Client(apiKey: $apiKey);

// Build per-request options: this call uses a retry limit of 5.
$retryOptions = RequestOptions::with(maxRetries: 5);

$extraction = $client->files->extractText(
    file: 'document.pdf',
    requestOptions: $retryOptions,
);

// Print the extracted text.
echo $extraction->text;

Error Handling

The SDK provides comprehensive error handling:


Code
 
<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;
use CrawlerDev\Core\Exceptions\APIConnectionException;
use CrawlerDev\Core\Exceptions\RateLimitException;
use CrawlerDev\Core\Exceptions\APIStatusException;

$client = new Client(apiKey: getenv("CRAWLER_DEV_API_KEY"));

// Catch clauses are tried top to bottom and the first matching type wins,
// so the specific exception types are listed before the generic fallback.
try {
    $result = $client->urls->extractText(url: 'https://example.com');
    echo $result->text;
} catch (APIConnectionException $e) {
    echo "The server could not be reached\n";
    // Dump the chained previous exception (null when there is none).
    var_dump($e->getPrevious());
} catch (RateLimitException $e) {
    // HTTP 429 is handled here, so the APIStatusException clause below
    // never needs to check for it.
    echo "A 429 status code was received; we should back off a bit.\n";
} catch (APIStatusException $e) {
    echo "Another non-200-range status code was received\n";
    echo $e->getMessage();

    if ($e->getStatusCode() === 401) {
        echo "\nInvalid API key\n";
    }
} catch (Exception $e) {
    // Fallback for anything not matched by the clauses above.
    echo "An error occurred: " . $e->getMessage() . "\n";
}

Error codes are as follows:

Cause	Error Type
HTTP 400	`BadRequestException`
HTTP 401	`AuthenticationException`
HTTP 403	`PermissionDeniedException`
HTTP 404	`NotFoundException`
HTTP 409	`ConflictException`
HTTP 422	`UnprocessableEntityException`
HTTP 429	`RateLimitException`
HTTP >= 500	`InternalServerException`
Other HTTP error	`APIStatusException`
Timeout	`APITimeoutException`
Network error	`APIConnectionException`

Go Java

Quick Start

Here's how to get started with text extraction using PHP:

Code

<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;

// Read the API key from the environment; when the variable is unset (or its
// value is falsy) the placeholder string is used instead.
$apiKey = getenv("CRAWLER_DEV_API_KEY") ?: "My API Key";
$client = new Client(apiKey: $apiKey);

// File extraction: dump the content type found on the result.
$fileResult = $client->files->extractText(file: 'file');
var_dump($fileResult->contentType);

// URL extraction: print the extracted text.
$urlResult = $client->urls->extractText(url: 'https://example.com');
echo $urlResult->text;

Examples

Extract Text from a File

Code

<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;

// The API key is taken from the environment.
$apiKey = getenv("CRAWLER_DEV_API_KEY");
$client = new Client(apiKey: $apiKey);

// Run text extraction on a single file.
// The file parameter can be a file path, file resource, or file contents
$extraction = $client->files->extractText(file: 'document.pdf');

// Print the extracted text, then the content type on a new line.
echo $extraction->text;
echo "\nContent type: {$extraction->contentType}";

Extract Text from Multiple URLs

Code

<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;

// The API key is taken from the environment.
$apiKey = getenv("CRAWLER_DEV_API_KEY");
$client = new Client(apiKey: $apiKey);

$pageUrls = [
    'https://example.com/page1',
    'https://example.com/page2',
    'https://example.com/page3',
];

// Each URL is processed independently: a failure on one page is reported
// and the loop moves on to the next URL.
foreach ($pageUrls as $pageUrl) {
    try {
        $extraction = $client->urls->extractText(url: $pageUrl);
        $line = "Text from {$pageUrl}: " . $extraction->text . "\n";
        echo $line;
    } catch (Exception $error) {
        $line = "Error extracting from {$pageUrl}: " . $error->getMessage() . "\n";
        echo $line;
    }
}

Using Request Options

Code

<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;
use CrawlerDev\RequestOptions;

// The API key is taken from the environment.
$apiKey = getenv("CRAWLER_DEV_API_KEY");
$client = new Client(apiKey: $apiKey);

// Build per-request options: this call uses a retry limit of 5.
$retryOptions = RequestOptions::with(maxRetries: 5);

$extraction = $client->files->extractText(
    file: 'document.pdf',
    requestOptions: $retryOptions,
);

// Print the extracted text.
echo $extraction->text;

Error Handling

The SDK provides comprehensive error handling:

Code

<?php

require_once 'vendor/autoload.php';

use CrawlerDev\Client;
use CrawlerDev\Core\Exceptions\APIConnectionException;
use CrawlerDev\Core\Exceptions\RateLimitException;
use CrawlerDev\Core\Exceptions\APIStatusException;

$client = new Client(apiKey: getenv("CRAWLER_DEV_API_KEY"));

// Catch clauses are tried top to bottom and the first matching type wins,
// so the specific exception types are listed before the generic fallback.
try {
    $result = $client->urls->extractText(url: 'https://example.com');
    echo $result->text;
} catch (APIConnectionException $e) {
    echo "The server could not be reached\n";
    // Dump the chained previous exception (null when there is none).
    var_dump($e->getPrevious());
} catch (RateLimitException $e) {
    // HTTP 429 is handled here, so the APIStatusException clause below
    // never needs to check for it.
    echo "A 429 status code was received; we should back off a bit.\n";
} catch (APIStatusException $e) {
    echo "Another non-200-range status code was received\n";
    echo $e->getMessage();

    if ($e->getStatusCode() === 401) {
        echo "\nInvalid API key\n";
    }
} catch (Exception $e) {
    // Fallback for anything not matched by the clauses above.
    echo "An error occurred: " . $e->getMessage() . "\n";
}

Error codes are as follows:

Cause	Error Type
HTTP 400	`BadRequestException`
HTTP 401	`AuthenticationException`
HTTP 403	`PermissionDeniedException`
HTTP 404	`NotFoundException`
HTTP 409	`ConflictException`
HTTP 422	`UnprocessableEntityException`
HTTP 429	`RateLimitException`
HTTP >= 500	`InternalServerException`
Other HTTP error	`APIStatusException`
Timeout	`APITimeoutException`
Network error	`APIConnectionException`