
Advanced Web Scraping

Anti-Scraping Mechanisms

```python
# -*- coding:utf-8 -*-
# @Time :2022/8/11 9:28
# @SOFTWARE :爬虫学习 (web-scraping study notes)

import requests

"""
Three main directions of anti-scraping:

1. Anti-scraping based on identity recognition
   headers:
       User-Agent (UA)
       Referer
       Cookie
   Request parameters, which may be:
       1. extracted from the HTML page
       2. obtained by sending an extra request
       3. generated by JavaScript
       4. obtained via a captcha

2. Anti-scraping based on crawler behaviour
   Based on request frequency or total request count:
       limiting the total number of requests per IP/account within a time window
       checking the interval between requests from the same IP/account
       setting a daily request threshold per IP/account

   Based on the crawling pattern (usually analysed on the crawling steps):
       redirects implemented in JavaScript
       honeypots (traps) that collect crawler IPs (or proxy IPs) and then block them
       returning fake data
       blocking the task queue
       blocking network IO

3. Anti-scraping based on data obfuscation
   The data in the response is specially processed via:
       custom fonts
       CSS tricks
       JavaScript-generated content
       images
       encoding tricks
"""
```
  • Using proxies (to deal with IP-based anti-scraping)
```python
# -*- coding:utf-8 -*-
# @Time :2022/1/10 11:47
# @SOFTWARE :爬虫学习 (web-scraping study notes)

import requests

# Route requests through a proxy so the target site sees the proxy's IP instead of yours.
proxies = {
    "http": "http://<proxy ip:port>",
    "https": "https://<proxy ip:port>"
}
url = "<target url>"
resp = requests.get(url, proxies=proxies)
```
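
A quick way to confirm that the proxy is actually in use is to request an IP-echo service; httpbin.org/ip is used here purely as an illustrative endpoint:

```python
import requests

# Illustrative free proxy; in practice the address comes from your own proxy pool.
proxies = {"http": "http://47.101.44.122:80"}

try:
    # httpbin echoes back the origin IP, so the output should show the proxy's address.
    resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
    print(resp.json())
except requests.RequestException as e:
    # Free proxies die frequently; treat failures as "discard and try the next one".
    print("proxy unusable:", e)
```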

  • Proxy pool

    • ```python
      # -*- coding:utf-8 -*-
      # @Time :2022/9/4 18:24
      # @SOFTWARE :爬虫 (web scraping)

      import requests
      from lxml import etree
      from bs4 import BeautifulSoup

      base_url = 'https://tmplay3.tom1881.com/2022-08-29/32f4a6ff34d6893df1c0b6f53db8fed7/m3u8/enc.key'
      headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.27'
      }
      proxies = {
          "http": "http://" + '47.101.44.122:80'
      }
      # Pass proxies=proxies here if the request should go through the proxy above.
      resp = requests.get(url=base_url, headers=headers)

      # Option 1: pull the first column of the proxy listing table with lxml/XPath.
      html = etree.HTML(resp.text)
      proxies_lst = html.xpath('//*[@id="GridViewOrder"]/tbody/tr[position()>1]/td[1]/text()')

      # Option 2: the same idea with BeautifulSoup, grabbing the centred table rows.
      html = BeautifulSoup(resp.text, 'html.parser')
      proxies_lst = html.find_all('tr', align="center")
      print(proxies_lst)

      # Save the raw response body to a local file.
      with open('./enc.key', 'w') as f:
          f.write(resp.content.decode("ISO-8859-1"))
      ```
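
      Once a list of proxy addresses has been scraped, requests can rotate through them. A minimal sketch, assuming proxies_lst has been normalised to plain "ip:port" strings (fetch_with_pool is a hypothetical helper, not part of the original notes):

      ```python
      import random
      import requests

      def fetch_with_pool(url, proxies_lst, retries=3):
          """Try the request through randomly chosen proxies until one succeeds."""
          for _ in range(retries):
              addr = random.choice(proxies_lst)  # e.g. "47.101.44.122:80"
              proxies = {"http": f"http://{addr}", "https": f"http://{addr}"}
              try:
                  return requests.get(url, proxies=proxies, timeout=5)
              except requests.RequestException:
                  continue  # dead proxy, try the next one
          return None
      ```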
      

      - Anti-hotlinking (Referer check): Pear Video example

      ```python
      # -*- coding:utf-8 -*-
      # @Time :2022/1/10 11:32
      # @SOFTWARE :爬虫学习 (web-scraping study notes)

      import requests

      url = "https://www.pearvideo.com/video_1747197"
      contId = url.split("_")[1]

      videoStatusUrl = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}"
      headers = {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62",
          # The Referer must point at the video page, otherwise the API rejects the request (anti-hotlink check).
          "Referer": url
      }
      resp = requests.get(videoStatusUrl, headers=headers)
      dic = resp.json()
      srcUrl = dic['videoInfo']['videos']['srcUrl']
      systemTime = dic['systemTime']
      # The returned URL embeds a timestamp; replace it with "cont-<contId>" to get the real video address.
      srcUrl = srcUrl.replace(systemTime, f"cont-{contId}")
      # print(srcUrl)

      # https://video.pearvideo.com/mp4/short/20220105/cont-1747197-15813272-hd.mp4
      # https://video.pearvideo.com/mp4/short/20220105/count-1747197-15813272-hd.mp4

      with open("a.mp4", mode="wb") as f:
          f.write(requests.get(srcUrl).content)
      print("over!")
      ```
  • Simulated login to a novel website (17k.com)

    • ```python
      # -*- coding:utf-8 -*-
      # @Time :2022/1/10 10:29
      # @SOFTWARE :爬虫学习 (web-scraping study notes)

      # Log in -> receive a cookie.
      # Then request the bookshelf URL with that cookie -> bookshelf contents.
      # The two operations must be chained together.
      # requests.session() handles this: a session can be seen as a chain of requests in which
      # the cookies are never lost.

      import requests

      # 1. Log in
      session = requests.session()
      url = "https://passport.17k.com/ck/user/login"
      data = {
          "loginName": "13593181493",
          "password": "yy963987"
      }
      resp = session.post(url, data=data)
      print(resp.cookies)

      # 2. Fetch the bookshelf contents (the session re-sends the login cookie automatically)
      shelf_url = "https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919"
      resp_shelf = session.get(shelf_url)
      print(resp_shelf.json())
      ```
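
      An equivalent approach (the Cookie item under headers-based identification above) is to copy the Cookie string from a logged-in browser and send it directly, without a session. A minimal sketch with a placeholder cookie value:

      ```python
      import requests

      shelf_url = "https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919"
      headers = {
          # Placeholder: paste the real Cookie header captured from a logged-in browser here.
          "Cookie": "accessToken=xxxx"
      }
      resp_shelf = requests.get(shelf_url, headers=headers)
      print(resp_shelf.json())
      ```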

      - Comprehensive exercise: scraping NetEase Cloud Music hot comments

      - ```python
      # -*- coding:utf-8 -*-
      # @Time :2022/4/27 10:15
      # @SOFTWARE :爬虫学习 (web-scraping study notes)

      import requests
      from Crypto.Cipher import AES
      from base64 import b64encode
      import json

      """
      1. Find the unencrypted request parameters.
      2. Reproduce NetEase's encryption logic to generate params and encSecKey.
      3. Send the request to NetEase and receive the comment data.
      """

      url = "https://music.163.com/weapi/comment/resource/comments/get?csrf_token="
      # The request method is POST
      data = {
          "cursor": "-1",
          "offset": "0",
          "orderType": "1",
          "pageNo": "1",
          "pageSize": "20",
          "rid": "R_SO_4_1481164987",
          "threadId": "R_SO_4_1481164987"
      }
      e = "010001"
      f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
      g = '0CoJUm6Qyw8W8jud'
      i = "8oWRZx8h4JbKIPKl"

      # Encryption helpers
      def get_encSecKey():
          # Because i is fixed above, encSecKey is a constant and can be hard-coded.
          return "8a8026c2e3c67c819062df7236879f6d5674a095e46112201958d79c0eb58d866e41ac1bb86f76531c02c52a40e5de751a723bb3a549df1f494f12c2d824d7e94961079a96caf436f677a7453b2d274b27667b71a1631a058d9f68c0f162d509f540b554064570a58f7e4d78c9cd9aebfd6a219bf943d80b09808fd7e38c0123"

      def to_16(data):
          # Pad the string so its length is a multiple of 16.
          pad = 16 - len(data) % 16
          data += chr(pad) * pad
          return data

      def get_params(data):  # data arrives here as a JSON string (the dict was dumped with json.dumps)
          first = enc_params(data, g)
          second = enc_params(first, i)
          return second  # this is the final params value

      # The encryption routine
      def enc_params(data, key):
          iv = "0102030405060708"
          data = to_16(data)
          aes = AES.new(key=key.encode("utf-8"), iv=iv.encode("utf-8"), mode=AES.MODE_CBC)  # create the cipher
          # Encrypt: the plaintext length must be a multiple of 16; pad with chr(16-n) if short,
          # or with 16 bytes of chr(16) if it is already aligned.
          bs = aes.encrypt(data.encode("utf-8"))
          return str(b64encode(bs), "utf-8")  # return as a string

      """
      The site's encryption function:
      window.asrsea(JSON.stringify(i7b), buV6P(["流泪", "强"]), buV6P(Rg1x.md), buV6P(["爱心", "女孩", "惊恐", "大笑"]))

      function a(a) {  // random 16-character string
          var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", c = "";
          for (d = 0; a > d; d += 1)         // loop 16 times
              e = Math.random() * b.length,  // random number
              e = Math.floor(e),             // round down
              c += b.charAt(e);              // pick one character
          return c
      }
      function b(a, b) {  // a is the content to encrypt
          var c = CryptoJS.enc.Utf8.parse(b)                    // b is the key
            , d = CryptoJS.enc.Utf8.parse("0102030405060708")
            , e = CryptoJS.enc.Utf8.parse(a)                    // e is the data
            , f = CryptoJS.AES.encrypt(e, c, {                  // c is the encryption key
                iv: d,                        // initialisation vector
                mode: CryptoJS.mode.CBC       // CBC mode
            });
          return f.toString()
      }
      function c(a, b, c) {  // RSA encryption
          var d, e;
          return setMaxDigits(131),
              d = new RSAKeyPair(b, "", c),
              e = encryptedString(d, a)
      }
      function d(d, e, f, g) {  // d: params, e: "010001", f: ..., g: '0CoJUm6Qyw8W8jud'
          var h = {}            // empty object
            , i = a(16);        // i is a 16-character random string
          h.encText = b(d, g),          // first AES pass, key g
          h.encText = b(h.encText, i),  // second AES pass, key i -> this is params
          h.encSecKey = c(i, e, f),     // c(i, e, f) -> encSecKey; e and f are fixed, so all variation
                                        // comes from i: fix i and encSecKey becomes fixed as well
          return h
      }
      function e(a, b, d, e) {
          var f = {};
          return f.encText = c(a + e, b, d),
      """

      resp = requests.post(url, data={
          "params": get_params(json.dumps(data)),
          "encSecKey": get_encSecKey()
      })
      print(resp.text)
      ```
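
      To actually pull the hot comments out of the response, the JSON can be parsed instead of printing the raw text. A minimal sketch, assuming the response carries a data.hotComments list with content and user.nickname fields (these key names are an assumption and may differ):

      ```python
      result = resp.json()
      # Hypothetical structure: adjust the keys if the API response differs.
      for comment in result.get("data", {}).get("hotComments", []):
          nickname = comment.get("user", {}).get("nickname", "unknown")
          content = comment.get("content", "")
          print(f"{nickname}: {content}")
      ```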