# bin-there-done-that/text/sms_spider.py

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from crochet import setup, wait_for
from twisted.internet import defer
from pydispatch import dispatcher
from scrapy import signals
# Crochet runs the Twisted reactor in a background thread so the Scrapy
# crawl below can be driven from ordinary blocking code (via @wait_for).
# It must be called once, before any crawl is started.
setup() # Initialize Crochet
class SmsSpider(scrapy.Spider):
    """Spider that fetches a JSON array of SMS messages from one API endpoint.

    The endpoint is expected to return a JSON list; each element is yielded
    as a scraped item. The raw list is also kept on ``self.messages``.
    """

    name = "sms"

    def __init__(self, api_url=None, *args, **kwargs):
        # Python 3 zero-argument super() — equivalent to the legacy
        # super(SmsSpider, self) form but not tied to the class name.
        super().__init__(*args, **kwargs)
        self.api_url = api_url
        self.messages = []  # filled in parse()

    def start_requests(self):
        # Fail fast with a clear message; scrapy.Request(url=None) would
        # otherwise raise an opaque error deep inside Scrapy.
        if not self.api_url:
            raise ValueError("SmsSpider requires an 'api_url' argument")
        yield scrapy.Request(url=self.api_url, callback=self.parse)

    def parse(self, response):
        # Assumes the endpoint returns a JSON list of message objects
        # — TODO confirm against the API contract.
        self.messages = response.json()
        for message in self.messages:
            yield message  # Yield each message individually as an item
@wait_for(timeout=10.0)  # Crochet: block the calling thread until the crawl finishes (or times out)
@defer.inlineCallbacks
def fetch_messages(api_url):
    """Crawl *api_url* with :class:`SmsSpider` and return the scraped items.

    Runs the crawl on Crochet's reactor thread and blocks the caller for up
    to 10 seconds. Returns the list of items emitted by the spider.
    """
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    output = []

    # Signal handler to collect items as the spider yields them.
    def collect_items(item, response, spider):
        output.append(item)

    dispatcher.connect(collect_items, signal=signals.item_scraped)
    try:
        yield runner.crawl(SmsSpider, api_url=api_url)
    finally:
        # Disconnect the handler so repeated calls don't leak handlers that
        # keep appending into earlier calls' output lists.
        dispatcher.disconnect(collect_items, signal=signals.item_scraped)
    # Plain return works under inlineCallbacks on Python 3; defer.returnValue
    # is the deprecated Python-2 spelling.
    return output
def run_fetch_messages(api_url):
    """Blocking convenience wrapper: fetch all SMS items scraped from *api_url*.

    Simply delegates to :func:`fetch_messages`, whose @wait_for decorator
    makes the Twisted crawl synchronous for the caller.
    """
    return fetch_messages(api_url)