xxxxxxxxxx
def parse_item(self, response):
    """Populate an EolZhiyeItem from a profession detail page."""
    item_loader = ItemLoader(EolZhiyeItem(), response)
    item_loader.add_value('url', response.url)
    # The profession code is embedded in the page URL.
    item_loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
    item_loader.add_css('name', 'h1#pagetitle::text')
    # Both category fields are scraped from the same anchor list.
    category_xpath = u'//div[@id="precontent"]/p[contains(., "??")]/a/text()'
    item_loader.add_xpath('category', category_xpath)
    item_loader.add_xpath('category2', category_xpath)
    item_loader.add_xpath(
        'detail',
        u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]',
        Join('\n'))
    yield item_loader.load_item()
xxxxxxxxxx
def parse_question(self, response):
    """Build a ZhihuQuestionItem from a question page and schedule its answers.

    Supports both the redesigned question page (detected via the
    "QuestionHeader-title" marker in the HTML) and the legacy layout,
    which expose the same fields through different selectors.
    """
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        # Fix: the original referenced question_id/question_item after the
        # layout branches even when the URL regex failed to match, raising
        # NameError. Bail out early instead of crashing.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)

    if "QuestionHeader-title" in response.text:
        # Redesigned page layout.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    else:
        # Legacy page layout.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

    question_item = item_loader.load_item()
    # First page of answers via the answer API (limit=20, offset=0).
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
xxxxxxxxxx
def parse_news_metro(self, response):
    """Parse a metro-news article page into a News item.

    Falls back to parse_news_pilkada for pages without the standard date
    block, stops the whole crawl once an article predates the last scrape,
    and follows pagination when a "next" link exists.
    """
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    date_selector = response.css('.artikel > div.block-tanggal::text')
    if not date_selector:
        # Pilkada articles use a different layout.
        return self.parse_news_pilkada(loader, response)
    try:
        # Drop the weekday before the comma and the trailing " WIB" suffix.
        date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
        # Translate each word (month names) so strptime can parse them.
        date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
    except Exception:
        # Best effort: an unparsable date yields a partial item.
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)

    if self.media['last_scraped_at'] >= published_at:
        # Fix: removed the unused `is_no_update` local.
        self.logger.info('Media have no update')
        raise CloseSpider('finished')
    loader.add_value('published_at', published_at)

    title_selector = response.css('.artikel > h1::text')
    if not title_selector:
        return loader.load_item()
    loader.add_value('title', title_selector.extract()[0])

    # Select all <p> which don't have an iframe inside them.
    raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
    if not raw_content_selector:
        return loader.load_item()
    # join() instead of repeated += avoids quadratic string concatenation.
    raw_content = ''.join(rsl.extract().strip() for rsl in raw_content_selector)

    # Go to next page while there is a next-page button.
    next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
    if next_page_selector:
        return Request(next_page_selector.extract()[0],
                       callback=lambda x, loader=loader, raw_content=raw_content:
                           self.parse_next_page_metro(x, loader, raw_content))
    loader.add_value('raw_content', raw_content)

    # The author is usually put inside a <strong> tag, however, some news
    # does not use one; scan the paragraphs from the end of the article.
    # NOTE: this block of code may need revision in the future.
    author_name = ''
    for author_name_selector in reversed(raw_content_selector):
        author_name_selector = author_name_selector.css('strong::text')
        for tmp in reversed(author_name_selector.extract()):
            tmp = tmp.strip()
            # Author signatures are all-caps words, optionally separated
            # by '.', '|' or whitespace.
            if tmp and all((x.isalpha() and x.isupper()) or x.isspace()
                           or x == '.' or x == '|' for x in tmp):
                author_name = tmp
                break
        if author_name:
            break
    author_name = ','.join(author_name.split(' | '))
    loader.add_value('author_name', author_name)
    return loader.load_item()
xxxxxxxxxx
def parse_item(self, response):
loader = ItemLoader(GaokaopaiZhiyeItem(), response)
loader.add_value('url', response.url)
loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
loader.add_css('name', u'.modTitle>h1::text')
def parse_category():
for e in response.css(u'.catType>a'):
yield {
'url': e.css('::attr(href)').extract_first(),
'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
'name': e.css('::text').extract_first(),
}
loader.add_value('category', list(parse_category()))
loader.add_css('detail', u'.zhiyeShow')
item = loader.load_item()
return FormRequest(
url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
formdata={'code': item['code'][0]},
meta={'item': item},
dont_filter=True,
callback=self.parse_majors
)
xxxxxxxxxx
def parse_book(self, response):
    """Scrape one book detail page into a BookItem."""
    loader = ItemLoader(item=BookItem(), response=response)
    # Strip markup from every field and keep only the first match.
    loader.default_input_processor = MapCompose(remove_tags)
    loader.default_output_processor = TakeFirst()

    product_table = "//table[@class='table table-striped']"
    field_paths = (
        ("title", "//div[@class='col-sm-6 product_main']/h1"),
        ("price", "//p[@class='price_color']"),
        ("upc", product_table + "/tr[1]/td"),
        ("product_type", product_table + "/tr[2]/td"),
        ("tax", product_table + "/tr[5]/td"),
        ("stock", product_table + "/tr[6]/td"),
        ("reviews", product_table + "/tr[7]/td"),
        # The star rating is encoded in the class attribute of the <p>
        # that follows the availability paragraph.
        ("rating", "//p[@class='instock availability']/following-sibling::p/@class"),
    )
    for field, xpath in field_paths:
        loader.add_xpath(field, xpath)
    yield loader.load_item()
xxxxxxxxxx
def parse(self, response):
    """Yield one QuoteItem per quote block on the page."""
    for quote in response.css(".quote"):
        loader = ItemLoader(item=QuoteItem(), selector=quote)
        loader.add_css("text", ".text")
        # Fix: the selector was misspelled ".authoor", which matches nothing;
        # the quotes.toscrape markup uses <small class="author">.
        loader.add_css("by", ".author")
        loader.add_css("tags", ".tag")
        yield loader.load_item()
xxxxxxxxxx
def parse_question(self, response):
    """Extract question fields from a Zhihu question page, then request
    the first page of its answers through the answer API."""
    question_id = response.meta.get("zhihu_id", "")

    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", question_id)
    css_fields = (
        ("title", "h1.QuestionHeader-title::text"),
        ("content", ".QuestionHeader-detail"),
        ("answer_num", ".List-headerText span::text"),
        ("comments_num", ".QuestionHeader-actions button::text"),
        ("watch_user_num", ".NumberBoard-value::text"),
        ("topics", ".QuestionHeader-topics .Popover div::text"),
    )
    for field, css in css_fields:
        loader.add_css(field, css)
    question_item = loader.load_item()

    # Answer API request: limit=20, offset=0.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
xxxxxxxxxx
def _extract_item(self, response):
    """Extract movie fields from the response and return them as a plain dict.

    Returning a dict instead of the scrapy.Item keeps the result directly
    JSON-serializable (the original comments mention scrapy-redis as the
    consumer — presumably its pipeline serializes items to JSON).
    """
    # Debug helpers (uncomment when needed):
    #   inspect_response(response, self)   # drop into a scrapy shell
    #   open_in_browser(response)          # view the response in a browser
    loader = ItemLoader(response=response, item=MyspiderItem(), type='html')
    xpath_fields = (
        ('movie_name', '//h1/span[@property="v:itemreviewed"]/text()'),
        ('movie_year', '//span[@property="v:initialReleaseDate"]/text()'),
        ('movie_type', '//span[@property="v:genre"]/text()'),
        ('movie_rate', '//strong[@class="ll rating_num"]/text()'),
    )
    for field, xpath in xpath_fields:
        loader.add_xpath(field, xpath)
    loader.add_value('url', response.url)
    return dict(loader.load_item())
xxxxxxxxxx
def parse_first_page(self, response):
    """Request every page of the album, then yield a PageItem for this page."""
    # The page count is embedded in the first pager link's text.
    count = int(response.xpath('//div[@id="aplist"]/ul/li[1]/a/text()')[0]
                .re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    album_url = response.url.replace(".html", '')
    # Page 1 is "<album>.html"; page N (N > 1) is "<album>_N.html".
    for page in xrange(1, count + 1):
        suffix = ".html" if page == 1 else "_" + str(page) + ".html"
        yield scrapy.Request(album_url + suffix, callback=self.parse_item,
                             cookies={'title': title})
    loader = ItemLoader(item=PageItem(), response=response)
    loader.add_value('title', title)
    loader.add_value('name', self.name)
    loader.add_value('url', response.url)
    loader.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
    yield loader.load_item()
xxxxxxxxxx
def parse_first_page(self, response):
    """Request every page of the album, then yield a PageItem for this page."""
    # The pager text holds the total number of pages.
    count = int(response.xpath('//ul[@class="image"]/text()')[0]
                .re(r'.*?(\d+).*?')[0])
    title = response.request.cookies['title']
    album_url = response.url.replace(".shtml", '')
    # Page 1 is "<album>.shtml"; page N (N > 1) is "<album>_N.shtml".
    for page in xrange(1, count + 1):
        suffix = ".shtml" if page == 1 else "_" + str(page) + ".shtml"
        yield scrapy.Request(album_url + suffix, callback=self.parse_item,
                             cookies={'title': title})
    loader = ItemLoader(item=PageItem(), response=response)
    loader.add_value('title', title)
    loader.add_value('name', self.name)
    loader.add_value('url', response.url)
    loader.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    yield loader.load_item()