[PYTHON] Scraping 58.com (58同城) rental listings with Scrapy

beijing.py

beijing.py


# -*- coding: utf-8 -*-
import scrapy
from zufang.items import ZufangItem

class BeijingSpider(scrapy.Spider):
    """Spider that walks the listing pages starting at ``start_urls``.

    Every ``.des`` node on a page is turned into one ``ZufangItem``; the
    spider then follows the ``.next`` link, if any, and repeats.
    """

    name = "beijing"
    allowed_domains = ["58.com"]
    start_urls = ['http://bj.58.com/chuzu/']

    def parse(self, response):
        """Yield one ``ZufangItem`` per listing, then a request for the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``ZufangItem`` objects, followed by at most one ``scrapy.Request``
            for the next page (handled by this same callback).
        """
        for listing in response.css('.des'):
            # A fresh item per listing: re-using one instance would let later
            # iterations overwrite items already handed to the pipelines and
            # leak fields (e.g. ``company``) from one listing into the next.
            item = ZufangItem()

            # Keep the title as text; encoding belongs at the output boundary.
            item['title'] = ''.join(listing.css('h2>a::text').extract()).strip()
            item['weburl'] = listing.css('h2>a::attr(href)').extract_first()
            item['addres'] = listing.css('.add>a:first-child::text').extract_first()

            # The ``.room`` text is expected to hold "<roomtype> <size> ...".
            # When it is missing or has fewer than two tokens, both fields
            # are left empty.
            room_parts = (listing.css('.room::text').extract_first() or '').split()
            if len(room_parts) >= 2:
                item['roomtype'] = room_parts[0]
                item['size'] = room_parts[1]
            else:
                item['roomtype'] = ''
                item['size'] = ''

            agent = listing.css('.jjr')
            if agent:
                item['contacts'] = agent.css('.listjjr>a::text').extract_first()
                # default='' avoids calling .strip() on None when the shop
                # name node is absent.
                item['company'] = agent.css('span>span::text').extract_first(default='').strip()
            else:
                item['contacts'] = ''
                item['company'] = ''

            item['price'] = listing.xpath(
                'following-sibling::div[@class="listliright"]/div[@class="money"]/b/text()'
            ).extract_first()
            yield item

        next_page = response.css('.next::attr(href)').extract_first()
        if next_page:
            # urljoin leaves absolute URLs untouched and resolves relative ones
            # against the current page, so both href styles work.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

items.py

items.py


import scrapy

class ZufangItem(scrapy.Item):
    """Container for one listing scraped by ``BeijingSpider.parse``."""

    # Listing title (text of the ``h2>a`` link).
    title = scrapy.Field()
    # Text of the first link inside the ``.add`` node.
    # NOTE(review): field name is spelled "addres"; kept as-is because the
    # spider uses it as a key.
    addres = scrapy.Field()
    # Second whitespace-separated token of the ``.room`` text.
    size = scrapy.Field()
    # First whitespace-separated token of the ``.room`` text.
    roomtype = scrapy.Field()
    # Text of the ``.listjjr>a`` link inside the ``.jjr`` node, if present.
    contacts = scrapy.Field()
    # Text of the ``<b>`` inside the sibling ``money`` div.
    price = scrapy.Field()
    # Text of ``span>span`` inside the ``.jjr`` node, if present.
    company = scrapy.Field()
    # ``href`` of the ``h2>a`` link.
    weburl = scrapy.Field()

pipelines.py

pipelines.py


import codecs
import json

class ZufangPipeline(object):
    """Item pipeline that writes every item to ``zufang.json``, one JSON object per line."""

    def __init__(self):
        # codecs.open returns a writer that accepts text and encodes it to
        # UTF-8 on the way out.
        self.file = codecs.open('zufang.json', 'wb', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` as a single JSON line and pass it on unchanged.

        Args:
            item: the scraped item (anything ``dict()`` accepts).
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines still receive it.
        """
        # Byte-string values are decoded as UTF-8 so the record is all text
        # before serialization.
        record = {
            key: value.decode('utf-8') if isinstance(value, bytes) else value
            for key, value in dict(item).items()
        }
        # ensure_ascii=False keeps non-ASCII characters readable in the
        # output. Decoding the dumped line with 'unicode_escape' instead
        # would also un-escape sequences such as \n and \" inside values and
        # produce invalid JSON.
        self.file.write(json.dumps(record, ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file (Scrapy calls this hook when the spider closes)."""
        self.file.close()

Recommended Posts

58 The same castle
[python] Permutation generation considering the same elements
Hashing algorithm for determining the same image
Python open and io.open are the same
Access files in the same directory as the executable
(Note) Importing Excel with the same column name
A program that searches for the same image
Detect folders with the same image in ImageHash
Building multiple Python environments on the same system
Loop variables at the same time in the template