python - Why does my Scrapy spider not run as expected? -
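When I run the code below, the output file ends up with the expected data from the second block of code but nothing from the first. In other words, all of the data from eventlocation through eventurl is present, but there is nothing for eventartist through eventdetails. What do I need to modify to get this working correctly?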
import urlparse

from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
#from nt.items import nowtorontoitem
from scrapy.item import Item, Field


class NowTorontoItem(Item):
    """Container for the fields scraped for one event."""

    # Populated in MySpider.parse() from the listings page.
    eventartist = Field()
    eventtitle = Field()
    eventholder = Field()
    eventdetails = Field()
    # Populated in MySpider.parse_details() from the event's detail page.
    eventlocation = Field()
    eventorganization = Field()
    eventname = Field()
    eventaddress = Field()
    eventlocality = Field()
    eventpostalcode = Field()
    eventphone = Field()
    eventurl = Field()


class MySpider(BaseSpider):
    """Spider that starts at the nowtoronto.com music listings page."""

    name = "ntspider"
    allowed_domains = ["nowtoronto.com"]
    start_urls = ["http://www.nowtoronto.com/music/listings/"]

    def parse(self, response):
        """Read each listing on the index page and request its detail page.

        Yields one Request per listing that has a "read more" link; the
        response of that request is handled by parse_details().
        """
        selector = Selector(response)
        listings = selector.css("div.listing-item0, div.listing-item1")

        for listing in listings:
            item = NowTorontoItem()
            for body in listing.css('span.listing-body > div.list-body'):
                item["eventartist"] = body.css("span.list-name::text").extract()
                item["eventtitle"] = body.css("span.list-body-emphasis::text").extract()
                item["eventholder"] = body.css("span.list-body-strong::text").extract()
                item["eventdetails"] = body.css("::text").extract()

            # Yield a Request so that Scrapy enqueues a new page fetch.
            detail_url = listing.css("div.listing-readmore > a::attr(href)")

            if detail_url:
                # NOTE: `item` filled in above is neither yielded nor attached
                # to this Request, so the fields set in this method never
                # reach the output.
                yield Request(urlparse.urljoin(response.url,
                                               detail_url.extract()[0]),
                              callback=self.parse_details)

    def parse_details(self, response):
        """Scrape venue information from an event detail page.

        Yields a new NowTorontoItem for every matching table cell inside each
        "div.whenwherecontent" element.
        """
        self.log("parse_details: %r" % response.url)
        selector = Selector(response)
        listings = selector.css("div.whenwherecontent")

        for listing in listings:
            for body in listing.css('td.small-txt.dkgrey-txt.rightinfotd'):
                item = NowTorontoItem()
                item["eventlocation"] = body.css("span[property='v:location']::text").extract()
                item["eventorganization"] = body.css("span[property='v:organization'] span[property='v:name']::text").extract()
                item["eventname"] = body.css("span[property='v:name']::text").extract()
                item["eventaddress"] = body.css("span[property='v:street-address']::text").extract()
                item["eventlocality"] = body.css("span[property='v:locality']::text").extract()
                item["eventpostalcode"] = body.css("span[property='v:postal-code']::text").extract()
                item["eventphone"] = body.css("span[property='v:tel']::text").extract()
                item["eventurl"] = body.css("span[property='v:url']::text").extract()
                yield item
edit
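It appears to be running now, but there is a small problem. For each event, it returns either 2 rows (1 with all of the details and 1 with only the details pulled from the first code block) or 3 rows (1 with all of the details and 2 identical rows with only the details pulled from the first block).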
Here is an example of the first situation:
2014-03-21 11:12:40-0400 [ntspider] debug: parse_details: 'http://www.nowtoronto.com/music/listings/listing.cfm?listingid=129761&subsection=&category=&criticspicks=&date1=&date2=&locationid=0' 2014-03-21 11:12:40-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=129761&subsection=&category=&criticspicks=&date1=&date2=&locationid=0> {'eventaddress': [u'875 bloor w'], 'eventartist': [u'andria simone & guys'], 'eventdetails': [u'andria simone & guys', u' (pop/soul) ', u'baltic avenue', u' 8 pm, $15.'], 'eventholder': [u'baltic avenue'], 'eventlocality': [u'toronto'], 'eventlocation': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t'], 'eventname': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tbaltic avenue'], 'eventorganization': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tbaltic avenue'], 'eventphone': [u'647-898-5324'], 'eventpostalcode': [u'm6g 3t6'], 'eventtitle': [], 'eventurl': []} 2014-03-21 11:12:40-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=129761&subsection=&category=&criticspicks=&date1=&date2=&locationid=0> {'eventaddress': [], 'eventartist': [u'andria simone & guys'], 'eventdetails': [u'andria simone & guys', u' (pop/soul) ', u'baltic avenue', u' 8 pm, $15.'], 'eventholder': [u'baltic avenue'], 'eventlocality': [], 'eventlocation': [], 'eventname': [], 'eventorganization': [], 'eventphone': [], 'eventpostalcode': [], 'eventtitle': [], 'eventurl': []}
And here is an example of the second situation:
2014-03-21 11:21:23-0400 [ntspider] debug: parse_details: 'http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationid=0' 2014-03-21 11:21:23-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationid=0> {'eventaddress': [u'11 polson'], 'eventartist': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy '], 'eventdetails': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy ', u'bassweek: projek-hospitality ', u'sound academy', u' $35 or wristband tm.'], 'eventholder': [u'sound academy'], 'eventlocality': [u'toronto'], 'eventlocation': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t'], 'eventname': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tsound academy'], 'eventorganization': [u'\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tsound academy'], 'eventphone': [u'416-461-3625'], 'eventpostalcode': [u'm5a 1a4'], 'eventtitle': [u'bassweek: projek-hospitality '], 'eventurl': [u'sound-academy.com']} 2014-03-21 11:21:23-0400 [ntspider] debug: crawled (200) <get http://www.nowtoronto.com/music/listings/listing.cfm?listingid=122291&subsection=&category=&criticspicks=&date1=&date2=&locationid=0> (referer: http://www.nowtoronto.com/music/listings/) 2014-03-21 11:21:24-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationid=0> {'eventaddress': [], 'eventartist': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy '], 'eventdetails': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy ', u'bassweek: projek-hospitality ', u'sound academy', u' $35 or wristband tm.'], 'eventholder': [u'sound academy'], 'eventlocality': [], 'eventlocation': [], 'eventname': [], 'eventorganization': [], 'eventphone': [], 'eventpostalcode': [], 'eventtitle': [u'bassweek: projek-hospitality '], 'eventurl': 
[]} 2014-03-21 11:21:24-0400 [ntspider] debug: scraped <200 http://www.nowtoronto.com/music/listings/listing.cfm?listingid=130096&subsection=&category=&criticspicks=&date1=&date2=&locationid=0> {'eventaddress': [], 'eventartist': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy '], 'eventdetails': [u'danny byrd, s.p.y., fred v & grafix, marcus visionary, lushy ', u'bassweek: projek-hospitality ', u'sound academy', u' $35 or wristband tm.'], 'eventholder': [u'sound academy'], 'eventlocality': [], 'eventlocation': [], 'eventname': [], 'eventorganization': [], 'eventphone': [], 'eventpostalcode': [], 'eventtitle': [u'bassweek: projek-hospitality '], 'eventurl': []}
You should pass the item from parse() to parse_details() in the Request's meta argument:
yield Request(urlparse.urljoin(response.url, detail_url.extract()[0]), meta={'item': item}, callback=self.parse_details)
Then, in parse_details(), you can get the item from response.meta['item'] (see the Scrapy docs).
Also, you may want to yield the item if no details URL is found:
if detail_url:
    yield Request(urlparse.urljoin(response.url, detail_url.extract()[0]), meta={'item': item}, callback=self.parse_details)
else:
    yield item
Hope that helps.
Comments
Post a Comment