Migrating a CSDN blog

The Scrapy spider below walks the article list pages of a CSDN blog and saves each post it reaches, as a first step toward moving the blog elsewhere:

# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
import urlparse
from csdn.items import CsdnItem

class csdnSpider( CrawlSpider ):
    name = "csdn"
    allowed_domains = ["blog.csdn.net"]  # must cover the start URLs' host, or the offsite filter drops every request
    start_urls = [
            # List pages 1-38, generated instead of written out 38 times.
            "http://blog.csdn.net/huangxiansheng1980/article/list/%d" % page
            for page in range(1, 39)
            ]
    rules = (
            # Follow links matching this one article id and hand each
            # fetched page to parse_item.
            Rule(SgmlLinkExtractor(allow=('details/12513065',)),
                    callback='parse_item', follow=True),
            )
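    # A broader pattern would migrate every post instead of the single
    # hard-coded id above -- a sketch, assuming all CSDN article URLs
    # contain 'details/<numeric id>':
    #
    #     Rule(SgmlLinkExtractor(allow=(r'details/\d+',)),
    #          callback='parse_item', follow=True),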

    def parse_item( self, response ):
        print '++++++++crawling ' + response.url
        hxs = HtmlXPathSelector(response)
        # Post titles as rendered by CSDN.
        blog_titles = hxs.select('//h3/span/a/text()')
        # Post links (unused here; the CrawlSpider rule already follows them).
        blog_links = hxs.select('//h3/span/a/@href')
        # Save the raw page under the last URL segment (the article id);
        # split("/")[-2] would return "details" and overwrite one file.
        filename = response.url.split("/")[-1]
        open( filename, 'wb' ).write( response.body )
        item = CsdnItem()
        # Scrapy Items are dict-like; attribute assignment such as
        # item.title = ... raises AttributeError.
        item['title'] = blog_titles.extract()
        item['content'] = response.body  # raw body as a stand-in for the parsed post
        return item


spider = csdnSpider()  # module-level instance, kept from an older Scrapy convention; not needed by 'scrapy crawl'
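
The spider imports CsdnItem from csdn.items, but the post doesn't show that file. A minimal sketch of csdn/items.py, assuming only the two fields the spider actually sets:

# csdn/items.py -- hypothetical reconstruction; declares just the two
# fields the spider fills in.
from scrapy.item import Item, Field

class CsdnItem(Item):
    title = Field()
    content = Field()

With that in place, run the spider from the project root with scrapy crawl csdn; each crawled article's raw body is written to a file named after the last segment of its URL.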
