SDKs
Java SDK
Text Extraction with Java
Learn how to extract text from any kind of file or URL with the crawler.dev Java SDK.
Prerequisites
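To get the most out of this guide, you'll need a Java 8+ project built with Maven or Gradle, and a crawler.dev API key supplied through the `CRAWLER_DEV_API_KEY` environment variable or the `crawlerdev.apiKey` system property.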
Installation
Add the crawler.dev Java SDK to your project using Maven:
<dependency>
  <groupId>com.CONFIGURE_ME_crawlerdev.api</groupId>
  <artifactId>crawler-dev-java</artifactId>
  <version>0.1.0</version>
</dependency>
Or using Gradle:
implementation("com.CONFIGURE_ME_crawlerdev.api:crawler-dev-java:0.1.0")
Quick Start
Here's how to get started with text extraction using Java:
Codeimport com.configure_me_crawlerdev.api.client.CrawlerDevClient; import com.configure_me_crawlerdev.api.client.okhttp.CrawlerDevOkHttpClient; import com.configure_me_crawlerdev.api.models.files.FileExtractTextParams; import com.configure_me_crawlerdev.api.models.files.FileExtractTextResponse; import java.io.ByteArrayInputStream; // Configures using the `crawlerdev.apiKey` and `crawlerdev.baseUrl` system properties // Or configures using the `CRAWLER_DEV_API_KEY` and `CRAWLER_DEV_BASE_URL` environment variables CrawlerDevClient client = CrawlerDevOkHttpClient.fromEnv(); FileExtractTextParams params = FileExtractTextParams.builder() .file(new ByteArrayInputStream("file content".getBytes())) .build(); FileExtractTextResponse response = client.files().extractText(params); System.out.println(response.getContentType());
Features
- Enterprise-grade functionality with comprehensive error handling
- Full type safety with comprehensive JavaDoc
- Automatic retries and error handling
- Support for Java 8+
- Built-in request/response validation
- Async support with CompletableFuture
- Enterprise-ready with comprehensive logging
Repository
Examples
Extract Text from a File
Codeimport com.configure_me_crawlerdev.api.client.CrawlerDevClient; import com.configure_me_crawlerdev.api.client.okhttp.CrawlerDevOkHttpClient; import com.configure_me_crawlerdev.api.models.files.FileExtractTextParams; import com.configure_me_crawlerdev.api.models.files.FileExtractTextResponse; import java.nio.file.Paths; public class FileExample { public static void main(String[] args) { // Configures using environment variables or system properties CrawlerDevClient client = CrawlerDevOkHttpClient.fromEnv(); // Extract text from a file // You can pass a Path, InputStream, or byte array FileExtractTextParams params = FileExtractTextParams.builder() .file(Paths.get("document.pdf")) .build(); FileExtractTextResponse response = client.files().extractText(params); System.out.println(response.getText()); System.out.println("Content type: " + response.getContentType()); } }
Extract Text from Multiple URLs Concurrently
Codeimport com.configure_me_crawlerdev.api.client.CrawlerDevClient; import com.configure_me_crawlerdev.api.client.okhttp.CrawlerDevOkHttpClient; import com.configure_me_crawlerdev.api.models.urls.UrlExtractTextParams; import com.configure_me_crawlerdev.api.models.urls.UrlExtractTextResponse; import java.util.Arrays; import java.util.List; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; public class ConcurrentExample { public static void main(String[] args) throws ExecutionException, InterruptedException { CrawlerDevClient client = CrawlerDevOkHttpClient.fromEnv(); List<String> urls = Arrays.asList( "https://example.com/page1", "https://example.com/page2", "https://example.com/page3" ); // Extract text from multiple URLs concurrently using async client List<CompletableFuture<UrlExtractTextResponse>> futures = urls.stream() .map(url -> { UrlExtractTextParams params = UrlExtractTextParams.builder() .url(url) .build(); return client.async().urls().extractText(params); }) .toList(); // Wait for all requests to complete CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); for (int i = 0; i < futures.size(); i++) { try { UrlExtractTextResponse response = futures.get(i).get(); System.out.println("Text from " + urls.get(i) + ": " + response.getText()); } catch (Exception e) { System.err.println("Error extracting from " + urls.get(i) + ": " + e.getMessage()); } } } }
Using Async Methods
Codeimport com.configure_me_crawlerdev.api.client.CrawlerDevClient; import com.configure_me_crawlerdev.api.client.okhttp.CrawlerDevOkHttpClient; import com.configure_me_crawlerdev.api.models.urls.UrlExtractTextParams; import com.configure_me_crawlerdev.api.models.urls.UrlExtractTextResponse; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; public class AsyncExample { public static void main(String[] args) throws ExecutionException, InterruptedException { CrawlerDevClient client = CrawlerDevOkHttpClient.fromEnv(); UrlExtractTextParams params = UrlExtractTextParams.builder() .url("https://example.com") .build(); // Extract text asynchronously using async() method CompletableFuture<UrlExtractTextResponse> future = client.async().urls().extractText(params); // Do other work while the request is processing System.out.println("Processing request..."); // Get the result when ready UrlExtractTextResponse response = future.get(); System.out.println(response.getText()); } }
Error Handling
The SDK provides comprehensive error handling:
Codeimport com.configure_me_crawlerdev.api.client.CrawlerDevClient; import com.configure_me_crawlerdev.api.client.okhttp.CrawlerDevOkHttpClient; import com.configure_me_crawlerdev.api.models.urls.UrlExtractTextParams; import com.configure_me_crawlerdev.api.exceptions.CrawlerDevException; public class ErrorHandlingExample { public static void main(String[] args) { CrawlerDevClient client = CrawlerDevOkHttpClient.fromEnv(); UrlExtractTextParams params = UrlExtractTextParams.builder() .url("https://example.com") .build(); try { var response = client.urls().extractText(params); System.out.println(response.getText()); } catch (CrawlerDevException e) { int statusCode = e.getStatusCode(); switch (statusCode) { case 401: System.err.println("Invalid API key"); break; case 429: System.err.println("Rate limit exceeded"); break; default: System.err.println("API error: " + e.getMessage()); } } catch (Exception e) { System.err.println("An error occurred: " + e.getMessage()); } } }
