Here's how to get started with text extraction using Python:
Code
import os

from api.crawler.dev_sdks import APICrawlerDevSDKs as CrawlerDev

# Create the client; the API key is looked up in the environment.
client = CrawlerDev(
    api_key=os.environ.get("API_CRAWLER_DEV_SDKS_API_KEY"),  # This is the default and can be omitted
)

# Extract text from a file
response = client.extract.from_file(
    file=b"file content here",
)
print(response.content_type)

# Extract text from a URL
response = client.extract.from_url(
    url="https://example.com",
)
print(response.text)
import os
from pathlib import Path

from api.crawler.dev_sdks import APICrawlerDevSDKs as CrawlerDev

client = CrawlerDev(
    api_key=os.environ.get("API_CRAWLER_DEV_SDKS_API_KEY"),
)

# Extract text from a PDF file
# You can pass a PathLike instance, bytes, or a tuple of (filename, contents, media type)
result = client.extract.from_file(
    file=Path("document.pdf"),
)
print(result.text)
print(f"Content type: {result.content_type}")
Extract Text from Multiple URLs
Code
import os

from api.crawler.dev_sdks import APICrawlerDevSDKs as CrawlerDev

client = CrawlerDev(
    api_key=os.environ.get("API_CRAWLER_DEV_SDKS_API_KEY"),
)

urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3",
]

# Extract text from multiple URLs (one request per URL, issued in order)
results = [client.extract.from_url(url=url) for url in urls]

# Pair each result with the URL it came from; both lists share the same order.
for url, result in zip(urls, results):
    print(f"Text from {url}: {result.text}")
Using the Async Client
Code
import os
import asyncio

from api.crawler.dev_sdks import AsyncAPICrawlerDevSDKs as AsyncCrawlerDev


async def main() -> None:
    """Extract text from a single URL with the async client and print it."""
    client = AsyncCrawlerDev(
        api_key=os.environ.get("API_CRAWLER_DEV_SDKS_API_KEY"),
    )

    # Same call as the sync client, but the request must be awaited.
    result = await client.extract.from_url(
        url="https://example.com",
    )
    print(result.text)


# Run the async function
asyncio.run(main())
Error Handling
The SDK provides comprehensive error handling:
Code
import os

import api.crawler.dev_sdks
from api.crawler.dev_sdks import APICrawlerDevSDKs as CrawlerDev

client = CrawlerDev(
    api_key=os.environ.get("API_CRAWLER_DEV_SDKS_API_KEY"),
)

# RateLimitError is listed before APIStatusError so that 429 responses get the
# dedicated back-off message instead of the generic status-error handling.
try:
    result = client.extract.from_url(url="https://example.com")
except api.crawler.dev_sdks.APIConnectionError as e:
    print("The server could not be reached")
    print(e.__cause__)  # an underlying Exception, likely raised within httpx
except api.crawler.dev_sdks.RateLimitError:
    print("A 429 status code was received; we should back off a bit.")
except api.crawler.dev_sdks.APIStatusError as e:
    print("Another non-200-range status code was received")
    print(e.status_code)
    print(e.response)
    # 429 is reported by the RateLimitError handler above, so only 401 is
    # special-cased here.
    # NOTE(review): assumes the SDK raises RateLimitError for every 429 --
    # confirm against the SDK's exception hierarchy.
    if e.status_code == 401:
        print("Invalid API key")
else:
    # Only reached when the request succeeded.
    print(result.text)