1. spider文件
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector


def parse_item(self, response):
    """Parse callback: collect every <img src> on the page into the item.

    The src attributes are protocol-relative (``//host/path``), so ``"http:"``
    is prefixed to make them absolute URLs for the images pipeline.

    NOTE(review): the original notes showed only the body of this callback;
    the enclosing ``def`` and the ``hxs`` selector are reconstructed here —
    confirm against the full spider.
    """
    hxs = HtmlXPathSelector(response)
    item = DomzItem()  # presumably the project's Item subclass with an image_urls Field
    image_urls = hxs.select('//img/@src').extract()
    item['image_urls'] = ["http:" + x for x in image_urls]
    return item
from scrapy.selector import HtmlXPathSelector

# Wrap the downloaded response in an XPath selector.
# NOTE(review): `response` comes from the surrounding spider callback — this
# snippet is not standalone.
hxs = HtmlXPathSelector(response)
class MySpider(CrawlSpider):
    """Example spider demonstrating download throttling."""

    name = 'myspider'
    # Throttle the crawl: wait this many seconds between consecutive requests.
    download_delay = 2
$ scrapy crawl somespider -s JOBDIR=crawls/somespider-1
# 这样开始下载之后可以用 Ctrl + C 停止；恢复下载时再次运行同样的命令即可:
$ scrapy crawl somespider -s JOBDIR=crawls/somespider-1
# Spider identity and crawl entry points.
name = "wikipedia"
allowed_domains = ["wikipedia.org"]  # off-site links are filtered out
start_urls = [
    "http://en.wikipedia.org/wiki/Pune",
]
2. setting文件
# Enable the built-in images pipeline so image_urls fields are downloaded.
# NOTE(review): this list-style ITEM_PIPELINES and the scrapy.contrib path are
# from old Scrapy versions — modern Scrapy uses a dict and scrapy.pipelines.images.
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
# Filesystem directory where downloaded images are stored (fill in a real path).
IMAGES_STORE = '...'
3. item 文件
# Fields required by the images pipeline:
#   image_urls — input: the URLs to download (set by the spider)
#   images     — output: download results, populated by the pipeline
image_urls = Field()
images = Field()