## 1. Scraping Images from Baidu Tieba
- ```python
# -*- coding:utf-8 -*-
# @Time :2022/8/9 23:42
# @SOFTWARE : scraper practice
import requests
from lxml import etree

class Tieba():
    def __init__(self, name):
        self.url = "https://tieba.baidu.com/f?ie=utf-8&kw={}".format(name)
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47"
        }

    def get_data(self, url):
        # Send the request and return the raw response body
        res = requests.get(url, headers=self.header)
        return res.content

    def parse_data(self, data):
        # Decode and parse the page, then pick out the post links
        data = data.decode()
        print(data)
        html = etree.HTML(data)
        el_list = html.xpath('/html/body/div[3]/div/div[2]/div/div/div[1]/div/div/div/div[4]/ul/li[3]/div/div[2]/div[1]/div[1]/a')
        print(el_list)

    def run(self):
        # Send the request and get the response
        data = self.get_data(self.url)
        # Extract data from the response (content plus the URL used for paging)
        self.parse_data(data)
        # Decide whether to stop -- see the pagination sketch below

if __name__ == '__main__':
    tieba = Tieba("汽车")  # search keyword: "cars"
    tieba.run()
```
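`run()` stops after a single page even though its comments mention a paging URL and a stop condition. A minimal sketch of that loop, assuming Tieba still exposes a scheme-relative "next page" link with the class below (both the class name and the URL scheme handling are assumptions to verify against the live page):

```python
import requests
from lxml import etree

HEADER = {"User-Agent": "Mozilla/5.0"}  # trimmed; reuse self.header inside the class

def crawl_all_pages(start_url, parse_data):
    """Follow next-page links until none remain."""
    url = start_url
    while url:
        data = requests.get(url, headers=HEADER).content
        parse_data(data)
        html = etree.HTML(data.decode())
        # Class name is an assumption about Tieba's current markup
        links = html.xpath('//a[@class="next pagination-item"]/@href')
        # Assumption: Tieba's links are scheme-relative ("//tieba.baidu.com/...")
        url = 'https:' + links[0] if links else None
```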
## 2. Automated Check-in on the Xiaomuchong Forum (muchong.com)
- ```python
# -*- coding:utf-8 -*-
# @Time :2022/8/14 12:06
# @SOFTWARE : scraper practice
import requests
from lxml import etree
import re

def log_in():
    session = requests.session()
    # Fetch the login page to pick up formhash and the t value
    url_login = 'http://muchong.com/bbs/logging.php?action=login'
    login_page = session.get(url_login)
    login_page_html = etree.HTML(login_page.text)
    # formhash
    formhash = login_page_html.xpath('//input[@name="formhash"]/@value')[0]
    # t value
    log_t = login_page_html.xpath('//form[@method="post"]/@action')[0].split("&t=")[-1]
    # Login URL
    log_url = 'http://muchong.com/bbs/logging.php?action=login&t=' + log_t
    # Form data
    my_data = {'formhash': formhash,
               'refer': '',
               'username': 'bajibajihe',
               'password': 'Yzh963987',
               'cookietime': '31536000',
               'loginsubmit': '提交'}
    res = session.post(url=log_url, data=my_data)
    res_html = etree.HTML(res.text)
    # The site answers with an arithmetic question, e.g. "问题:3加4等于多少?"
    verify_content = res_html.xpath('//div[@style = "padding:10px 0;"]//text()')[0]
    verify = re.search(r'问题:(\d+)(\D+)(\d+)等于多少', verify_content)
    number1 = int(verify.group(1))
    number2 = int(verify.group(3))
    if verify.group(2) == '加':      # plus
        my_answer = number1 + number2
    elif verify.group(2) == '减':    # minus
        my_answer = number1 - number2
    elif verify.group(2) == '乘以':  # times
        my_answer = number1 * number2
    else:                            # divided by
        my_answer = number1 / number2
    post_sec_hash = res_html.xpath('//input[@name = "post_sec_hash"]/@value')[0]
    new_data = {
        'formhash': formhash,
        'post_sec_code': my_answer,
        'post_sec_hash': post_sec_hash,
        'username': 'bajibajihe',
        'loginsubmit': '提交'
    }
    # Second post answers the verification question and completes the login
    res1 = session.post(log_url, data=new_data)
    # (unused) headers left over from a manual-cookie attempt; the session
    # already carries the login cookies
    headers = {
        'Cookie': '_ga=GA1.2.218366420.1658801280; _discuz_cc=36661247307489666; last_ip=120.208.99.4_30246316; Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569=1659937011,1659946720,1660358771,1660449877; _emuch_index=1; discuz_tpl=qing; acw_tc=276077cc16604961356387283e9bcad83fdbb4938555f633de690328b12936; _discuz_uid=30246316; _discuz_pw=b77e1e08db437cc8; _gat=1; Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569=1660497901',
        'Host': 'muchong.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.54'
    }
    resp = session.get('http://muchong.com/bbs/memcp.php?action=getcredit')
    res2 = etree.HTML(resp.text)
    # Sample text: ['您已经领取今天的红包啦,\n坚持哦,已经连续 2 天坚持领取红包了\n', '\n']
    claimed = res2.xpath('//div[@style = "padding:0 0 10px 0;"]/b[1]/text()')[0].replace('\n', '').split(',')[0]
    if claimed == "您已经领取今天的红包啦":
        # Return a single string so push() can send it directly
        coins = res2.xpath('//span[@style = "color:red;font-weight:bold;font-size:20px;"]/text()')[0]
        return "签到成功 您现在的金币数:%s" % coins
    else:
        credit_formhash = res2.xpath('//input[@name="formhash"]/@value')[0]
        credit_data = {'formhash': credit_formhash,
                       'getmode': '1',
                       'message': '',
                       'creditsubmit': '领取红包'}
        r = session.post('http://muchong.com/bbs/memcp.php?action=getcredit', data=credit_data)
        get_coins_number = etree.HTML(r.text).xpath('//span[@style="color:red;font-weight:bold;font-size:30px;"]//text()')[0]
        coins = etree.HTML(r.text).xpath('//span[@style="color:red;font-weight:bold;font-size:20px;"]/text()')[0]
        return '今天领取了金币数为:%s 目前的总金币数为:%s' % (get_coins_number, coins)

def push(res):
    # Push the result through ServerChan
    url = 'https://sctapi.ftqq.com/SCT165450TxflxDTaPnd9KfYpxBDj40ee3.send'
    data = {
        "title": '小木虫签到',
        "desp": res
    }
    requests.post(url, data)

def main():
    res = log_in()
    push(res)

if __name__ == '__main__':
    main()
```
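The brittle part of the flow above is parsing the arithmetic question. A standalone check of the regex against a made-up question string (the real text comes from the page) shows how the three capture groups drive the answer:

```python
import re

sample = '问题:12加7等于多少?'  # made-up example of the page text
m = re.search(r'问题:(\d+)(\D+)(\d+)等于多少', sample)
print(m.group(1), m.group(2), m.group(3))  # -> 12 加 7
print(int(m.group(1)) + int(m.group(3)))   # -> 19
```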
## 3. Automated Check-in on the Ruike Forum (ruike1.com)
- ```python
# -*- coding:utf-8 -*-
# @Time :2022/8/13 11:01
# @SOFTWARE : scraper practice
import requests

def qiandao():
    session = requests.session()
    # Check-in endpoint; note the formhash baked into the URL (see the note below)
    url_cook = 'https://www.ruike1.com/k_misign-sign.html?operation=qiandao&format=global_usernav_extra&formhash=c6f8a78f&inajax=1&ajaxtarget=k_misign_topb'
    headers = {
        # 'referer': 'https://www.ruike1.com/home.php?mod=space&do=pm',
        # 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.54',
        'cookie': 'Hm_lvt_9d4dff14662c55b14fcd14fd34b8dcd8=1650421290,1650440143,1650593469,1650679950; zzhh_2132_saltkey=ptF5Jfl1; zzhh_2132_lastvisit=1660527542; zzhh_2132_ulastactivity=54adnP9uL38shJITAAmPYGl8UhDcwc6fyMZ0gquuRIvYA5KN6PR6; zzhh_2132_connect_is_bind=1; zzhh_2132_nofavfid=1; zzhh_2132_newemail=23994%091946497315%40qq.com%091660531193; zzhh_2132_sid=EO5ApZ; Hm_lvt_73ad58a7cf08cf5833714aed91aa7068=1660355572,1660445351,1660530897,1660535769; zzhh_2132_auth=1187HYNSK3dMtNMMciXLfjmKJwWKBC9uzwGER%2BIaOKlUhqLC%2FHqAUBAxNDArZq4pp%2BaYe9IpyVPmvhc90%2FeZvn6kYg; zzhh_2132_lastcheckfeed=23994%7C1660535772; zzhh_2132_lip=120.208.99.4%2C1660447350; zzhh_2132_seccodecSEO5ApZ=2.f08206f3c72f8f6fa1; zzhh_2132_sendmail=1; zzhh_2132_checkpm=1; Hm_lpvt_73ad58a7cf08cf5833714aed91aa7068=1660536077; zzhh_2132_lastact=1660536077%09misc.php%09patch'
    }
    res = session.get(url=url_cook, headers=headers)
    # The XML response wraps the status message in a CDATA block
    return res.text.split('CDATA')[-1]

def push(res):
    # Push the result through ServerChan
    url = 'https://sctapi.ftqq.com/SCT165450TxflxDTaPnd9KfYpxBDj40ee3.send'
    data = {
        "title": '瑞客签到',
        "desp": res
    }
    requests.post(url, data)

def main():
    res = qiandao()
    push(res)

if __name__ == '__main__':
    main()
```
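One caveat: `formhash=c6f8a78f` is hard-coded into the check-in URL, and Discuz rotates this token, so the request will eventually start failing. A sketch of fetching it fresh before signing in; the regex is an assumption about how the token appears in the page source:

```python
import re
import requests

def get_formhash(session, headers):
    # Assumption: the homepage embeds the current token as "formhash=xxxxxxxx";
    # verify against the actual ruike1.com page source and adjust.
    home = session.get('https://www.ruike1.com/', headers=headers)
    m = re.search(r'formhash=(\w{8})', home.text)
    return m.group(1) if m else None
```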
## 4. Scraping Huya Livestream Cover Images
- ```python
# -*- coding:utf-8 -*-
# @Time :2022/8/22 10:24
# @SOFTWARE : scraper practice
"""
Plan:
1. Connect to the Huya live page
2. Fetch the page source
3. Parse the image URLs out of the source
4. Request each image URL and save it locally
"""
import requests
from lxml import etree

# 1. A plain GET fetches the page
url = 'https://www.huya.com/g/xingxiu#cate-1-27'
# url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
}
res = requests.get(url=url, headers=headers)
print(res.content)
# 2./3. Parse the cover-image URLs from the live-card list
html = etree.HTML(res.content)
lis = html.xpath('//*[@id="js-live-list"]/li/a[1]/img/@src')
print(lis)
```
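Step 4 of the plan (fetch each image URL and save it locally) never made it into the code. A minimal continuation of the block above, assuming `lis` holds directly fetchable `https://...` URLs (Huya may lazy-load the real covers through a `data-original` attribute instead, which would need a different XPath):

```python
import os

os.makedirs('covers', exist_ok=True)
for i, src in enumerate(lis):
    img = requests.get(src, headers=headers)  # reuses requests/headers from above
    with open(os.path.join('covers', '%d.jpg' % i), 'wb') as f:
        f.write(img.content)
```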
## 5. Scraping Douban Top 250
- ```python
# -*- coding:utf-8 -*-
# @Time :2022/8/23 11:17
# @SOFTWARE : scraper practice
"""
1. Connect to the target URL
2. Locate the content to scrape
3. Extract and save the content
"""
import requests
from lxml import etree

def spider():
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
    }
    res = requests.get(url=url, headers=headers)
    print(res.content.decode())
    html = etree.HTML(res.text)
    # title = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()')
    # print(title)
    # remark = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div/span[4]')
    # print(remark)
    # grade = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div/span[2]')
    # center = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/p[2]/span')
    # with open('top250.csv','wb') as f:
    #     f.write(title)
    #     f.close()

if __name__ == '__main__':
    spider()
```
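The commented-out save would fail as written: `open('top250.csv', 'wb')` opens a binary file and `f.write(title)` passes it a list. A working sketch with the csv module, assuming `title` holds the list of movie names extracted above:

```python
import csv

def save_titles(titles, path='top250.csv'):
    # utf-8-sig so Excel opens the Chinese titles correctly
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['title'])
        writer.writerows([t] for t in titles)
```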
## 6. Tencent Video Automated Check-in
- ```python
# -*- coding:utf-8 -*-
# @Time :2022/9/2 10:35
# @SOFTWARE : scraper
import requests

# Tencent Video cookie, copied from a logged-in browser session
tx_cookie = 'RK=J1kB4/jBcl; ptcz=7a4c2ebc2ec907eff3406b538c1345e746da0344ffe25c6d815a3e9d290b59ef; pgv_pvid=3235502464; fqm_pvqid=9add1a34-0489-4999-be56-071778ac10ad; eas_sid=o1B6Q4E398W9b9Q8v7V7M6h1c5; LW_uid=51U6D5T3G5f3C9H6N248U7R6o9; uin_cookie=o1946497315; ied_qq=o1946497315; o_cookie=1946497315; pac_uid=1_1946497315; iip=0; tvfe_boss_uuid=d6a51a1d80eaf478; video_platform=2; LW_sid=i1h6l68280P4J96179D72027U7; pgv_info=ssid=s1080425395; video_guid=10415ddb7eef607d; qpsvr_localtk=0.19202484948915632; main_login=qq; vqq_access_token=12FF3453F36D7A6B7CCD35A8DF65D7B2; vqq_appid=101483052; vqq_openid=095EA2916C3F9224C96BA1E30A10677B; vqq_vuserid=193206557; vqq_vusession=uFlrAOGUtFjNR0rJbJ-ZTg.N; vqq_refresh_token=6649F75421F7722F23F0EFBCA11A61A4; login_time_init=2022-9-2 10:43:18; vqq_next_refresh_time=6596; vqq_login_time_init=1662086600'
auth_refresh_url = 'https://access.video.qq.com/user/auth_refresh?vappid=11059694&vsecret=fdf61a6be0aad57132bc5cdf78ac30145b6cd2c1470b0cfe&type=qq&g_tk=&g_vstk=86707172&g_actk=110063020&callback=jQuery191048904872757789786_1662086599463&_=1662086599481'
# ServerChan SendKey; undefined in the garbled original -- this reuses the key
# from the earlier sections
server_key = 'SCT165450TxflxDTaPnd9KfYpxBDj40ee3'

# Push via the new ServerChan API
def send_server(title, content):
    server_content = {'text': title, 'desp': content}
    server_url = "https://sctapi.ftqq.com/%s.send" % server_key
    resp = requests.post(server_url, params=server_content)
    print('ServerChan push status code: %s' % resp.status_code)

# Tencent Video check-in
def tx_sign():
    url1 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=hierarchical_task_system&cmd=2'
    url2 = 'https://v.qq.com/x/bu/mobile_checkin'
    # url3 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=spp_MissionFaHuo&cmd=4&task_id=1'  # watch for 60 minutes
    # url4 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=spp_MissionFaHuo&cmd=4&task_id=7'  # download
    # url5 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=spp_MissionFaHuo&cmd=4&task_id=6'  # gift
    # url6 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=spp_MissionFaHuo&cmd=4&task_id=3'  # danmaku
    login_headers = {
        'Referer': 'https://v.qq.com',
        'Cookie': tx_cookie
    }
    # Refresh the session; the response cookies carry a fresh vqq_vusession
    login = requests.get(auth_refresh_url, headers=login_headers)
    resp_cookie = requests.utils.dict_from_cookiejar(login.cookies)
    if not resp_cookie:
        send_server('腾讯视频V力值签到通知', '获取Cookie失败,Cookie失效')
        return '获取Cookie失败,Cookie失效'  # stop here: the refresh failed
    # Rebuild the cookie string, swapping in the refreshed vqq_vusession
    arr = tx_cookie.split('; ')
    sign_cookie = ''
    for item in arr:
        if 'vqq_vusession' in item:
            continue
        sign_cookie += (item + '; ')
    sign_cookie += ('vqq_vusession=' + resp_cookie['vqq_vusession'] + ';')
    sign_headers = {
        'Cookie': sign_cookie,
        'Referer': 'https://m.v.qq.com'
    }
    send_message = ''
    sign1 = response_handle(url1, sign_headers)
    send_message += '链接1' + sign1 + '\n'
    # sign2 = response_handle(url2, sign_headers)
    send_message += '链接2' + '任务未完成' + '\n'
    # sign3 = response_handle(url3, sign_headers)
    # send_message += '链接3' + sign3 + '\n'
    # sign4 = response_handle(url4, sign_headers)
    # send_message += '链接4' + sign4 + '\n'
    # sign5 = response_handle(url5, sign_headers)
    # send_message += '链接5' + sign5 + '\n'
    # sign6 = response_handle(url6, sign_headers)
    # send_message += '链接6' + sign6 + '\n'
    mes = '腾讯视频V力值签到通知\n\n' + send_message
    return mes

# Interpret Tencent Video's response
def response_handle(url, sign_headers):
    resp_str = requests.get(url, headers=sign_headers).text
    if '-777903' in resp_str:
        return "已获取过V力值"  # already claimed today
    elif '-777902' in resp_str:
        return "任务未完成"  # task not completed
    elif 'OK' in resp_str:
        # crude slice that cuts the point value out of the JSONP body
        return "成功,获得V力值:" + resp_str[42:-14]
    else:
        return "执行出错"  # something went wrong

if __name__ == '__main__':
    message = tx_sign()
    print(message)
    # send_server('腾讯视频签到通知', message)
```
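Sections 2, 3, and 6 each re-implement the same ServerChan push. As a sketch, a small shared module would remove the duplication (the SendKey below is the one used in the sections above; substitute your own):

```python
# serverchan.py -- shared push helper (sketch)
import requests

SERVER_KEY = 'SCT165450TxflxDTaPnd9KfYpxBDj40ee3'  # replace with your own SendKey

def push(title, content=''):
    """Send a notification through ServerChan and return the HTTP status."""
    url = 'https://sctapi.ftqq.com/%s.send' % SERVER_KEY
    resp = requests.post(url, data={'title': title, 'desp': content})
    return resp.status_code
```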
## 7. Scraping Lianjia Second-hand Housing Listings
- ```python
# -*- coding:utf-8 -*-
# @Time :2022/8/24 9:41
# @SOFTWARE : scraper
"""
1. Connect to the URL and fetch the page source
2. Parse the source and extract the data
3. Visualize the data
"""
import requests
from lxml import etree
import pandas as pd
from pyecharts.charts import Bar  # bar chart

def spider():
    # 1. Fetch the page source
    url = 'https://ty.lianjia.com/ershoufang/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
    }
    res = requests.get(url=url, headers=headers)
    # 2. Parse the response
    html = etree.HTML(res.text)
    div_list = html.xpath('//div[@class = "info clear"]')
    # 3. Extract the data
    # 3.1 An empty list to collect the listings
    second_house = []
    for div in div_list:
        # 1. Name of the residential compound
        position = div.xpath('.//div[@class = "positionInfo"]/a/text()')[0]
        # 2. Total price of the listing (note the leading dot: search within
        #    this div, not from the document root, or every row repeats the first match)
        total_price = div.xpath('.//div[@class = "totalPrice totalPrice2"]/span/text()')[0]
        # 3. Unit price of the listing
        unitPrice = div.xpath('.//div[@class = "unitPrice"]/span/text()')[0].replace("元/平", "").replace(",", "")
        second_house.append([position, float(unitPrice), float(total_price)])
    title = ["小区名称", "单价:元/平", "总价:万"]  # compound name, unit price (yuan/m2), total price (10k yuan)
    # 4. Save the data locally
    table = pd.DataFrame(data=second_house, columns=title)
    # 5. Visualize as a bar chart
    bar = Bar()
    # Set the x axis (compound names) and y axis (unit prices)
    bar.add_xaxis(list(table["小区名称"]))
    bar.add_yaxis("太原二手房价格", list(table["单价:元/平"]))
    # Render to an HTML file
    bar.render("./second_house_TaiYuan.html")

if __name__ == '__main__':
    spider()
```
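The step-4 comment promises to save the data locally, but the DataFrame is only built, never written. One line inside `spider()` (before or after the charting step) does it; `index=False` drops the pandas row index and `utf-8-sig` keeps Excel happy with the Chinese column headers:

```python
# Persist the scraped listings alongside the chart
table.to_csv('second_house_TaiYuan.csv', index=False, encoding='utf-8-sig')
```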