python - Scrapy not collecting data -


I am using Scrapy to collect emails from Craigslist, and when I run it, it returns a blank row in the .csv file. I am able to extract the title, tag, and link; only the email is a problem. Here is the code:

 # -*- coding: utf-8 -*-
import re

import scrapy
from scrapy.http import Request


# Item class included here
class dmozitem(scrapy.Item):
    """Fields collected for a single posting."""

    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class dmozspider(scrapy.Spider):
    """Spider that follows posting links and then requests each posting's reply page."""

    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "http://raleigh.craigslist.org/bab/5038434567.html"
    ]

    base_url = 'http://raleigh.craigslist.org/'

    def parse(self, response):
        """Yield one request per ``a.hdrlnk`` link found on the start page."""
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.base_url + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        """Fill link/title/tag from a posting page, then request its reply page.

        Returns None implicitly when the URL does not end in ``<id>.html``.
        """
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.base_url + "reply/nos/vgm/" + item_id

            item = dmozitem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            # NOTE: `[0]` raises IndexError when the XPath matches nothing.
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
            # The partially filled item travels to the next callback via `meta`.
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        """Store the text of the ``anonemail`` div in ``attr`` and return the item."""
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item

First of all, you meant to have start_urls point at the catalog page: http://raleigh.craigslist.org/search/bab.

Also, as far as I understand, the additional request to get the email should go to reply/ral/bab/ instead of reply/nos/vgm/.

Also, if there is no attrgroup on the page, you will get an error (indexing `[0]` on an empty list) on the following line:

item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0]) 

Replace it with:

item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()) 

The complete code that worked for me:

# -*- coding: utf-8 -*-
import re

import scrapy


class dmozitem(scrapy.Item):
    """Fields collected for a single posting."""

    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class dmozspider(scrapy.Spider):
    """Spider that walks the catalog page, each posting, and each posting's reply page."""

    name = "dmoz"
    allowed_domains = ["raleigh.craigslist.org"]
    start_urls = [
        "http://raleigh.craigslist.org/search/bab"
    ]

    base_url = 'http://raleigh.craigslist.org/'

    def parse(self, response):
        """Yield one request per ``a.hdrlnk`` link found on the catalog page."""
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.base_url + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        """Fill link/title/tag from a posting page, then request its reply page.

        Returns None implicitly when the URL does not end in ``<id>.html``.
        """
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.base_url + "reply/ral/bab/" + item_id

            item = dmozitem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            # Joining the whole list (no `[0]`) yields "" when nothing matches.
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract())
            # The partially filled item travels to the next callback via `meta`.
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        """Store the text of the ``anonemail`` div in ``attr`` and return the item."""
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item

Comments

Popular posts from this blog

powershell Start-Process exit code -1073741502 when used with Credential from a windows service environment -

twig - Using Twigbridge in a Laravel 5.1 Package -

c# - LINQ join Entities from HashSet's, Join vs Dictionary vs HashSet performance -