
[scrapy] Opening a scrapy project directory with PyCharm

Source: CSDN  Time: 2023-03-17 10:35:08

I. Creating the spider



cd into the folder where you keep your projects, create the project, enter it, and generate the spider:

cd path/to/your/projects
scrapy startproject BQG
cd BQG
scrapy genspider biquge biduo.cc
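After these commands, the standard Scrapy scaffolding should look roughly like this (this is the default layout produced by startproject and genspider, which is what we open in PyCharm in the next part):

BQG/
├── scrapy.cfg          # project configuration
└── BQG/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── biquge.py   # created by "scrapy genspider biquge biduo.cc"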

II. Opening the project directory with PyCharm

1. settings configuration

Rather than listing individual options, I paste my entire settings.py here.

# -*- coding: utf-8 -*-
from fake_useragent import UserAgent

# Scrapy settings for BQG project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "BQG"

SPIDER_MODULES = ["BQG.spiders"]
NEWSPIDER_MODULE = "BQG.spiders"

LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = UserAgent().chrome

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#   "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "BQG.middlewares.BqgSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "BQG.middlewares.BqgDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   "BQG.pipelines.BqgPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
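Note that USER_AGENT = UserAgent().chrome relies on the third-party fake_useragent package, which has to be installed separately (pip install fake-useragent). A quick standalone check that the import works and produces a Chrome user-agent string (illustrative only, not part of the project):

# sanity_check_ua.py - prints a randomly picked Chrome user-agent string,
# e.g. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...".
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.chrome)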

2. Writing the spider file

A quick walkthrough of the spider:

(1) parse handles the home page and collects our two kinds of target links: novel category pages (e.g. https://www.biduo.cc/book_1_1/) and novel index (chapter list) pages. Category page links are fed back into parse, so each category page is parsed the same way and keeps yielding both kinds of links; novel index links (e.g. https://www.biduo.cc/biquge/56_56606/) are handed to the next function, get_novel.

(2) get_novel only extracts the novel's title and the URL of its first chapter (e.g. https://www.biduo.cc/biquge/56_56606/c100952.html). (We deliberately do not yield every chapter link here: Scrapy processes requests concurrently, so if all chapter links were yielded at once the responses would come back out of order.) Starting from that first chapter URL, get_page_content then walks through the whole novel.

(3) get_page_content extracts each chapter's title and content. After scraping the current chapter it follows the "next chapter" link to request the following page, which keeps the chapters in order and avoids the ordering problem described above.

# -*- coding: utf-8 -*-
import re

import scrapy


class BiqugeSpider(scrapy.Spider):
    name = "biquge"
    allowed_domains = ["biduo.cc"]
    start_urls = ["https://www.biduo.cc/"]

    def parse(self, response):
        # print(response.text)
        # Category pages look like /book_1_1/; follow them back into parse.
        # pat = r'href="/book_\d+_\d+/">'  # stricter pattern; the simpler one below is used instead
        pat = r"/book_\d+_\d+/"
        tab_lists = re.findall(pat, response.text)
        print("****************************")
        for li in tab_lists:
            yield scrapy.Request(
                url="https://www.biduo.cc" + li,
                callback=self.parse,
            )
        # Novel index pages look like /biquge/56_56606/; hand them to get_novel.
        pat1 = r"/biquge/\d+_\d+/"
        t_lists = re.findall(pat1, response.text)
        for li in t_lists:
            # print(li)
            yield scrapy.Request(
                url="https://www.biduo.cc" + li,
                callback=self.get_novel,
            )

    def get_novel(self, response):
        novel_url = response.url
        novel_title = response.xpath("//div[@id='info']/h1/text()").extract_first()
        # novel_lists = response.xpath("//div[@id='list']/dl/dd/a/@href").extract()
        # Only the first chapter link is taken; the rest are reached by "next page".
        novel_first = "https://www.biduo.cc" + response.xpath(
            "//div[@id='list']/dl/dd[1]/a/@href").extract_first()
        yield scrapy.Request(
            url=novel_first,
            callback=self.get_page_content,
            meta={"novel_title": novel_title, "novel_url": novel_url},
        )

    def get_page_content(self, response):
        item = {}
        item["novel_title"] = response.meta["novel_title"]
        item["novel_url"] = response.meta["novel_url"]
        item["page_title"] = response.xpath("//h1/text()").extract_first()
        item["page_url"] = response.url
        item["page_content"] = "".join(
            response.xpath("//div[@id='content']/text()").extract()).replace("\xa0", "")
        # item["page_content"] = response.xpath("//div[@id='content']/text()").extract()
        yield item
        # The third link in div.bottem2 is the "next chapter" button.
        # //*[@id="wrapper"]/div[4]/div/div[2]/div[1]/a[3]
        next1 = response.xpath("//div[@class='bottem2']/a[3]/@href").extract_first()
        print(next1)
        next_url = "https://www.biduo.cc" + next1
        # On the last chapter the "next" link points back to the novel index page,
        # so this comparison stops the crawl for that novel.
        if next_url != item["novel_url"]:
            yield scrapy.Request(
                url=next_url,
                callback=self.get_page_content,
                meta={"novel_title": item["novel_title"], "novel_url": item["novel_url"]},
            )
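The two regular expressions in parse carry the whole link discovery, so it is worth checking them against sample hrefs before running the crawl. A tiny standalone sketch, using made-up sample HTML:

# check_patterns.py - illustrative only; the href values are invented examples.
import re

sample = 'href="/book_1_1/" ... href="/biquge/56_56606/"'

print(re.findall(r"/book_\d+_\d+/", sample))    # ['/book_1_1/']        -> category pages, fed back to parse
print(re.findall(r"/biquge/\d+_\d+/", sample))  # ['/biquge/56_56606/'] -> novel index pages, sent to get_novel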

3. The pipeline file

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os


class BqgPipeline(object):

    def open_spider(self, spider):
        # Create the output directory once, before the first chapter arrives.
        os.makedirs("novels", exist_ok=True)

    def process_item(self, item, spider):
        print(item["page_title"])
        # One text file per novel; each chapter is appended in the order it is crawled.
        with open("novels/{}.txt".format(item["novel_title"]), "a+", encoding="utf-8") as f:
            f.write(item["page_title"] + "\n")
            f.write(item["page_url"] + "\n")
            f.write(item["page_content"] + "\n")
        return item
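If reopening the file for every chapter bothers you, a variant can cache one open handle per novel and close them all in close_spider. A sketch of that design choice (the class name CachedFileBqgPipeline is my own, and it would have to replace BqgPipeline in ITEM_PIPELINES):

import os


class CachedFileBqgPipeline(object):
    """Keeps one open handle per novel and closes them when the spider stops."""

    def open_spider(self, spider):
        os.makedirs("novels", exist_ok=True)
        self.files = {}

    def process_item(self, item, spider):
        f = self.files.get(item["novel_title"])
        if f is None:
            f = open("novels/{}.txt".format(item["novel_title"]), "a+", encoding="utf-8")
            self.files[item["novel_title"]] = f
        f.write(item["page_title"] + "\n")
        f.write(item["page_url"] + "\n")
        f.write(item["page_content"] + "\n")
        return item

    def close_spider(self, spider):
        for f in self.files.values():
            f.close()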

4. Running the spider:
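To start the crawl, run scrapy crawl biquge from the project root. If you prefer launching it from inside PyCharm, a small helper script does the same thing (the file name run.py is my own convention):

# run.py - place next to scrapy.cfg in the project root.
# Equivalent to typing "scrapy crawl biquge" on the command line.
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "biquge"])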

5. The full source code has been uploaded to my CSDN resources; download it there if you need it.
