python - Why does my Scrapy spider not run as expected? -

When I run the code below, the output file has the expected data from the second code block but nothing from the first. In other words, all of the data from eventlocation to eventurl is present, but nothing from eventartist to eventdetails. What do I need to modify to get this working correctly?

import urlparse

from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
#from nt.items import nowtorontoitem
from scrapy.item import Item, Field


class nowtorontoitem(Item):
    """Container for the fields scraped for one event."""

    # Filled in by myspider.parse() from the listings index page.
    eventartist = Field()
    eventtitle = Field()
    eventholder = Field()
    eventdetails = Field()
    # Filled in by myspider.parse_details() from the event's detail page.
    eventlocation = Field()
    eventorganization = Field()
    eventname = Field()
    eventaddress = Field()
    eventlocality = Field()
    eventpostalcode = Field()
    eventphone = Field()
    eventurl = Field()


class myspider(BaseSpider):
    """Spider that walks the music listings index and follows each
    listing's "read more" link to its detail page."""

    name = "ntspider"
    allowed_domains = ["nowtoronto.com"]
    start_urls = ["http://www.nowtoronto.com/music/listings/"]

    def parse(self, response):
        """Handle the listings index page.

        Builds an item from each listing's body and yields a Request for
        the listing's detail page, handled by parse_details().
        """
        selector = Selector(response)
        listings = selector.css("div.listing-item0, div.listing-item1")

        for listing in listings:
            item = nowtorontoitem()
            for body in listing.css('span.listing-body > div.list-body'):
                item["eventartist"] = body.css("span.list-name::text").extract()
                item["eventtitle"] = body.css("span.list-body-emphasis::text").extract()
                item["eventholder"] = body.css("span.list-body-strong::text").extract()
                item["eventdetails"] = body.css("::text").extract()

            # yield request()
            # scrapy enqueues new page fetch
            detail_url = listing.css("div.listing-readmore > a::attr(href)")

            # NOTE(review): `item` is neither yielded here nor attached to
            # the Request below, so the four fields set above never reach
            # the output.
            if detail_url:
                # The href may be relative, so resolve it against the
                # URL of the page it was found on.
                yield Request(urlparse.urljoin(response.url,
                                               detail_url.extract()[0]),
                              callback=self.parse_details)

    def parse_details(self, response):
        """Handle one event detail page.

        Yields a fresh item per matching info cell, populated with the
        location/contact fields only.
        """
        self.log("parse_details: %r" % response.url)
        selector = Selector(response)
        listings = selector.css("div.whenwherecontent")

        for listing in listings:
            for body in listing.css('td.small-txt.dkgrey-txt.rightinfotd'):
                item = nowtorontoitem()
                item["eventlocation"] = body.css("span[property='v:location']::text").extract()
                item["eventorganization"] = body.css("span[property='v:organization'] span[property='v:name']::text").extract()
                item["eventname"] = body.css("span[property='v:name']::text").extract()
                item["eventaddress"] = body.css("span[property='v:street-address']::text").extract()
                item["eventlocality"] = body.css("span[property='v:locality']::text").extract()
                item["eventpostalcode"] = body.css("span[property='v:postal-code']::text").extract()
                item["eventphone"] = body.css("span[property='v:tel']::text").extract()
                item["eventurl"] = body.css("span[property='v:url']::text").extract()
                yield item

edit

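It appears to be running now, but there is a small problem. For each event, it returns either 2 rows (1 with all of the details, and 1 with only the details pulled from the first code block) or 3 rows (1 with all of the details, and 2 identical rows with only the details pulled from the first block).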

Here is an example of the first situation:

2014-03-21 11:12:40-0400 [ntspider] debug: parse_details: 'http://www.nowtoronto.com/music/listings/listing.cfm?listingid=129761&subsection=&category=&criticspicks=&date1=&date2=&locationid=0' 2014-03-21 11:12:40-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=129761&subsection=&category=&criticspicks=&date1=&date2=&locationid=0>     {'eventaddress': [u'875 bloor w'],      'eventartist': [u'andria simone & guys'],      'eventdetails': [u'andria simone & guys',                       u' (pop/soul) ',                       u'baltic avenue',                       u' 8 pm, $15.'],      'eventholder': [u'baltic avenue'],      'eventlocality': [u'toronto'],      'eventlocation': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t'],      'eventname': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tbaltic avenue'],      'eventorganization': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tbaltic avenue'],      'eventphone': [u'647-898-5324'],      'eventpostalcode': [u'm6g 3t6'],      'eventtitle': [],      'eventurl': []} 2014-03-21 11:12:40-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=129761&subsection=&category=&criticspicks=&date1=&date2=&locationid=0>     {'eventaddress': [],      'eventartist': [u'andria simone & guys'],      'eventdetails': [u'andria simone & guys',                       u' (pop/soul) ',                       u'baltic avenue',                       u' 8 pm, $15.'],      'eventholder': [u'baltic avenue'],      'eventlocality': [],      'eventlocation': [],      'eventname': [],      'eventorganization': [],      'eventphone': [],      'eventpostalcode': [],      'eventtitle': [],      'eventurl': []}

And here is an example of the second situation:

2014-03-21 11:21:23-0400 [ntspider] debug: parse_details: 'http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationid=0' 2014-03-21 11:21:23-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationid=0>     {'eventaddress': [u'11 polson'],      'eventartist': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy '],      'eventdetails': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy ',                       u'bassweek: projek-hospitality ',                       u'sound academy',                       u' $35 or wristband tm.'],      'eventholder': [u'sound academy'],      'eventlocality': [u'toronto'],      'eventlocation': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t'],      'eventname': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tsound academy'],      'eventorganization': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tsound academy'],      'eventphone': [u'416-461-3625'],      'eventpostalcode': [u'm5a 1a4'],      'eventtitle': [u'bassweek: projek-hospitality '],      'eventurl': [u'sound-academy.com']} 2014-03-21 11:21:23-0400 [ntspider] debug: crawled (200) <get http://www.nowtoronto.com/music/listings/listing.cfm?listingid=122291&subsection=&category=&criticspicks=&date1=&date2=&locationid=0> (referer: http://www.nowtoronto.com/music/listings/) 2014-03-21 11:21:24-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationid=0>     {'eventaddress': [],      'eventartist': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy '],      'eventdetails': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy ',                       u'bassweek: projek-hospitality ',                       u'sound academy',                       u' $35 or wristband tm.'],      
'eventholder': [u'sound academy'],      'eventlocality': [],      'eventlocation': [],      'eventname': [],      'eventorganization': [],      'eventphone': [],      'eventpostalcode': [],      'eventtitle': [u'bassweek: projek-hospitality '],      'eventurl': []} 2014-03-21 11:21:24-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationid=0>     {'eventaddress': [],      'eventartist': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy '],      'eventdetails': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy ',                       u'bassweek: projek-hospitality ',                       u'sound academy',                       u' $35 or wristband tm.'],      'eventholder': [u'sound academy'],      'eventlocality': [],      'eventlocation': [],      'eventname': [],      'eventorganization': [],      'eventphone': [],      'eventpostalcode': [],      'eventtitle': [u'bassweek: projek-hospitality '],      'eventurl': []}

You should pass the item from parse() to parse_details() in the Request's meta argument:

yield request(urlparse.urljoin(response.url,               detail_url.extract()[0]),               meta={'item': item},               callback=self.parse_details)

Then, in parse_details(), you can get the item from response.meta['item'] (docs).

Also, you may want to yield the item if no details are found:

if detail_url:     yield request(urlparse.urljoin(response.url,                   detail_url.extract()[0]),                   meta={'item': item},                   callback=self.parse_details) else:     yield item

Hope that helps.

Search This Blog

Sp

python - Why does my Scrapy spider not run as expected? -

Comments

Post a Comment

Popular posts from this blog

java - WrongTypeOfReturnValue exception thrown when unit testing using mockito -

c++11 - Intel compiler and "cannot have an in-class initializer" when using constexpr -

symfony - imagine_filter() not generating the correct url in LiipImagineBundle -