1.文件结构:
2.lesson.py代码
import scrapy
from scrapy.http import Request

from ts.items import TsItem


class LessonSpider(scrapy.Spider):
    """Crawl course pages 1-120 and extract title, tab link and view count.

    Yields one ``TsItem`` per course page; the pipeline writes them to disk.
    """

    name = 'lesson'
    # NOTE(review): the real domain was stripped when this tutorial was
    # extracted — fill in the target site's domain, e.g. ['example.com'].
    allowed_domains = ['']
    # NOTE(review): start URL is relative for the same reason; prepend the
    # real scheme and host before running.
    start_urls = ['/course/1']

    def parse(self, response):
        """Extract one course page into a TsItem, then enqueue the rest.

        :param response: the downloaded course page.
        :returns: generator yielding a ``TsItem`` followed by follow-up
            ``Request`` objects for courses 2-120.
        """
        item = TsItem()
        # Breadcrumb "active" entry holds the course title.
        item['title'] = response.xpath(
            "//ol[@class='breadcrumb']/li[@class='active']/text()").extract()
        # Active nav tab links to the current course section.
        item['link'] = response.xpath(
            "//ul[@class='nav nav-tabs']/li[@class='active']/a/@href").extract()
        # View/student counter span.
        item['stu'] = response.xpath(
            "//span[@class='course-view']/text()").extract()
        yield item
        for i in range(2, 121):  # controls how many courses are crawled
            url = '/course/' + str(i)
            # BUG FIX: scrapy.Request rejects scheme-less URLs with
            # "Missing scheme in request url"; resolve the relative path
            # against the current response URL instead.
            yield Request(response.urljoin(url), callback=self.parse)
3.item.py代码
# -*- coding: utf-8 -*-
import scrapy


class TsItem(scrapy.Item):
    """Container for one scraped course.

    Fields (each holds the raw list returned by ``.extract()``):
    """

    title = scrapy.Field()  # course title from the breadcrumb
    link = scrapy.Field()   # href of the active nav tab
    stu = scrapy.Field()    # student / view count text
4.pipelines.py代码
class TsPipeline(object):
    """Item pipeline: echo each item to stdout and append it to a text file."""

    def __init__(self):
        # Append mode so repeated crawls accumulate into the same file.
        # BUG FIX: pass an explicit encoding — without it Windows uses the
        # locale codepage and non-ASCII course titles can raise
        # UnicodeEncodeError on write.
        # NOTE(review): hard-coded absolute path — consider a setting.
        self.fh = open("D:/软件(学习)/Python/Test/chapter8/result/ts.txt",
                       "a", encoding="utf-8")

    def process_item(self, item, spider):
        """Print the item's fields and append them to the output file.

        :param item: item with list-valued 'title', 'link' and 'stu' fields.
        :param spider: the spider that produced the item (unused).
        :returns: the item unchanged, so later pipelines still receive it.
        """
        print(item['title'])
        print(item['link'])
        print(item['stu'])
        print('~~~~~~')
        self.fh.write(item['title'][0] + "\n" + item['link'][0] + "\n"
                      + item['stu'][0] + "\n" + "~~~~~~~" + "\n")
        return item

    def close_spider(self, spider=None):
        """Release the output file when the spider closes.

        BUG FIX: Scrapy invokes ``close_spider(spider)``; the original
        signature took no spider argument and raised TypeError at shutdown,
        leaving the file handle open. The default keeps direct no-arg calls
        working.
        """
        self.fh.close()
5.setting.py代码
# Scrapy settings for the ts project.
BOT_NAME = 'ts'

SPIDER_MODULES = ['ts.spiders']
NEWSPIDER_MODULE = 'ts.spiders'

...  # other default settings omitted in the tutorial

# Respect robots.txt rules.
ROBOTSTXT_OBEY = True

...

# Register the file-writing pipeline (priority 300).
ITEM_PIPELINES = {
    'ts.pipelines.TsPipeline': 300,
}
TXT文件展示: