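"""Fetch messages from a JSON API by running a Scrapy spider synchronously.

Scrapy normally owns the Twisted reactor, which can only be started once per
process. Crochet runs that reactor in a background thread instead, and its
``wait_for`` decorator turns the asynchronous crawl into a blocking call, so
the crawl can be triggered from ordinary synchronous code.
"""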
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from crochet import setup, wait_for
from twisted.internet import defer
from pydispatch import dispatcher

setup()  # Initialize Crochet: starts the Twisted reactor in a background thread


class SmsSpider(scrapy.Spider):
    """Spider that fetches a JSON list of messages from a single API URL."""

    name = "sms"

    def __init__(self, api_url=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.api_url = api_url
        self.messages = []

    def start_requests(self):
        yield scrapy.Request(url=self.api_url, callback=self.parse)

    def parse(self, response):
        self.messages = response.json()
        for message in self.messages:
            yield message  # Yield each message individually


@wait_for(timeout=10.0)  # Block the caller until the crawl finishes, or raise after 10s
@defer.inlineCallbacks
def fetch_messages(api_url):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()

    output = []

    # Signal handler to collect items as the spider scrapes them
    def collect_items(item, response, spider):
        output.append(item)

    dispatcher.connect(collect_items, signal=signals.item_scraped)

    yield runner.crawl(SmsSpider, api_url=api_url)
    # Disconnect so repeated calls don't leave stale handlers appending to old lists
    dispatcher.disconnect(collect_items, signal=signals.item_scraped)
    return output  # On Python 3, inlineCallbacks supports a plain return (defer.returnValue is deprecated)


def run_fetch_messages(api_url):
    # Thin synchronous wrapper; safe to call from ordinary (non-Twisted) code
    return fetch_messages(api_url)
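

# Example usage (a minimal sketch): the endpoint URL below is a hypothetical
# placeholder, not part of the original script. fetch_messages() blocks until
# the crawl completes, so it works from a plain script or a synchronous web view.
if __name__ == "__main__":
    for msg in run_fetch_messages("https://example.com/api/messages"):  # hypothetical URL
        print(msg)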