却道天凉好个秋~
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # @Time : 2018/4/24 0024 12:47
- # @Author : konmin
- # @Site :
- # @File : test.py
- # @Software: PyCharm
- '''
- 本爬虫爬取种子磁力站的链接并对结果整理进Excel表
- https://m.zhongziso.com/list/mila%20azul/1
- https://m.zhongziso.com/list_ctime/mila+azul/2
- https://m.zhongziso.com/list_ctime/mila+azul/3
- https://m.zhongziso.com/list_ctime/%E6%B3%B7%E6%B3%BD%E8%90%9D%E6%8B%89/7
- '''
import requests
from bs4 import BeautifulSoup
import xlwt
import random
import re
import sys
# NOTE(review): the original had `reload(sys)` here with no follow-up
# sys.setdefaultencoding() call — a Python 2 relic that does nothing on its
# own, so it has been removed.

# Pool of desktop browser User-Agent strings; one is chosen at random so the
# requests look less like an automated client.
ua_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" ,
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3"
]
# A single UA is picked once at import time and reused for every request.
headers = {"user-agent":random.choice(ua_list)}

# Module-level accumulators: filled page by page in first_htm() and written
# out in save_file(). One entry per torrent result, kept in parallel order.
times =[]        # upload-date cells
names = []       # result-title anchors
sizes = []       # file-size cells
margnets1 = []   # magnet-link anchors (even positions on the page)
margnets2 = []   # thunder/xunlei-link anchors (odd positions on the page)
def test(text):
    """Return the total number of result pages for *text*, or 0 on failure.

    Fetches page 1 of the search results and reads the page count out of the
    href of the '尾页' (last page) navigation link.
    """
    req = requests.get('https://m.zhongziso.com/list_ctime/%s/' % text + '1', headers=headers)
    if req.status_code != 200:
        return 0
    soup = BeautifulSoup(req.content, 'lxml', from_encoding='utf-8')
    # Guard: if the site layout changed or there are no results, the
    # '尾页' link is absent — the original code crashed here (TypeError).
    last_page_link = soup.find('a', text='尾页')
    if last_page_link is None:
        return 0
    # The href ends in '/<page-number>'; extract that trailing number.
    match = re.search(r'/(\d+)', last_page_link['href'])
    if match is None:
        return 0
    return int(match.group(1))
def first_htm(num,text):
    """Scrape result page *num* for search term *text*.

    Appends the scraped anchors/cells onto the module-level accumulator
    lists (names, sizes, times, margnets1, margnets2).
    """
    resp = requests.get('https://m.zhongziso.com/list_ctime/%s/'%text + str(num),headers=headers)
    page = BeautifulSoup(resp.content,'lxml',from_encoding='utf-8')
    link_buttons = page.find_all('a',class_="btn btn-success")
    # The download buttons alternate between the two link types on the page,
    # so split them by position parity.
    margnets1.extend(link_buttons[::2])
    margnets2.extend(link_buttons[1::2])
    # NOTE(review): 'text-time' feeding sizes and 'text-size' feeding times
    # looks swapped, but this matches the original behavior — verify against
    # the live page markup before changing.
    sizes.extend(page.find_all('dd',class_="text-time"))
    times.extend(page.find_all('dd',class_="text-size"))
    names.extend(page.find_all('a',class_="text-success"))
def save_file(text):
    """Write the accumulated results into '<text>_all_resource.xls'.

    Columns: 名称 (name), 大小 (size), 时间 (time), 迅雷 (thunder link),
    磁力 (magnet link). Row 0 is the header; data rows start at 1.
    """
    xls = xlwt.Workbook()
    sheet1 = xls.add_sheet('sheet1')
    row0 = [u'名称',u'大小',u'时间',u'迅雷',u'磁力']
    for col, title in enumerate(row0):
        sheet1.write(0, col, title)
    # enumerate(..., 1) replaces the five hand-maintained row counters
    # (t1..t5) of the original; start=1 keeps row 0 for the header.
    for row, d in enumerate(names, 1):
        sheet1.write(row, 0, d.get_text())
    for row, c in enumerate(times, 1):
        sheet1.write(row, 1, c.get_text())
    for row, b in enumerate(sizes, 1):
        sheet1.write(row, 2, b.get_text())
    for row, a2 in enumerate(margnets2, 1):
        sheet1.write(row, 3, a2['href'])
    for row, a1 in enumerate(margnets1, 1):
        sheet1.write(row, 4, a1['href'])
    # Python 2: re-encode the UTF-8 search term to GBK for a Windows
    # filename. NOTE(review): terms with characters outside GBK will raise
    # UnicodeEncodeError here — consider errors='replace' if that matters.
    xls.save(str(text).decode('utf-8').encode('gbk')+'_all_resource.xls')
- if __name__ == "__main__":
- print sys.getdefaultencoding()
- text = raw_input(u'你搜索的是?')
- # print text
- end_num = test(text)
- if end_num>0:
- print '共有'+str(end_num),'页,开始爬取'+ text +'的链接...\n'
- for j in range(1,end_num+1):
- print j
- first_htm(j,text)
- save_file(text)
- print '存取完毕!可以尽情欣赏啦!'
- else:
- print '啊呀,失败了!'
以上,基本没有什么特殊的知识点,几个库及其使用方法在之前的爬虫实例中都有讲解,这次主要多了个xlwt库,还有一个是xlrd库,这两个库实现对Excel的读写,基本如此,因为我只是爬取数据玩玩,目前不会使用到数据库,并且我也不会使用,用Excel存取数据对目前小白式的爬虫足以应付了。
由于爬虫维护太频繁,上次发的“九妹图社”的爬虫因为网站改版(说不定就是因为一直被爬,站长受不了了),打包成的exe可执行程序已经无法使用,这次这个就不打包了,想直接使用的小白,嗯,很简单的,自己动手按着我这个写一个吧!
好了,如果你有什么疑问,欢迎留言,留下邮箱便可以收到回复。