
# Crawler Case Studies

## 1. Scraping images from a Tieba forum

```python
# -*- coding:utf-8 -*-
# @Time     :2022/8/9 23:42
# @SOFTWARE :爬虫学习

import requests
from lxml import etree


class Tieba():
    def __init__(self, name):
        self.url = "https://tieba.baidu.com/f?ie=utf-8&kw={}".format(name)
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47"
        }

    def get_data(self, url):
        # Send the request and return the raw response body
        res = requests.get(url, headers=self.header)
        return res.content

    def parse_data(self, data):
        # Decode the response and parse it with lxml
        data = data.decode()
        print(data)
        html = etree.HTML(data)

        el_list = html.xpath('/html/body/div[3]/div/div[2]/div/div/div[1]/div/div/div/div[4]/ul/li[3]/div/div[2]/div[1]/div[1]/a')
        print(el_list)

    def run(self):
        # Send the request and get the response
        data = self.get_data(self.url)
        # Extract data from the response (post data and the next-page URL)
        self.parse_data(data)
        # Decide whether to stop


if __name__ == '__main__':
    tieba = Tieba("汽车")
    tieba.run()
```
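
`parse_data` above only prints the matched `<a>` elements; the pagination and the actual image download that `run()`'s comments mention are not implemented. Below is a minimal sketch of those missing pieces. The XPath for in-thread images (`//img[@class="BDE_Image"]/@src`), the "next page" link selector, and the file naming are assumptions, not part of the original script.

```python
import os
import requests
from lxml import etree

def save_images(headers, list_url, out_dir="tieba_imgs"):
    """Hypothetical continuation: walk listing pages and save every in-thread image."""
    os.makedirs(out_dir, exist_ok=True)
    count = 0
    while list_url:
        res = requests.get(list_url, headers=headers)
        html = etree.HTML(res.content.decode())
        # Assumed markup: in-post images usually carry the class "BDE_Image"
        for img_url in html.xpath('//img[@class="BDE_Image"]/@src'):
            with open(os.path.join(out_dir, "{}.jpg".format(count)), "wb") as f:
                f.write(requests.get(img_url, headers=headers).content)
            count += 1
        # Assumed "next page" link; stop when it is missing
        next_page = html.xpath('//a[contains(@class, "next")]/@href')
        list_url = "https:" + next_page[0] if next_page else None
```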
    

## 2. Automated sign-in on the Muchong (小木虫) forum

```python
# -*- coding:utf-8 -*-
# @Time :2022/8/14 12:06
# @SOFTWARE :爬虫学习

import requests
from lxml import etree
import re


def log_in():
    session = requests.session()
    # Login page: grab the formhash and the t value
    url_login = 'http://muchong.com/bbs/logging.php?action=login'
    login_page = session.get(url_login)
    login_page_html = etree.HTML(login_page.text)

    # formhash
    formhash = login_page_html.xpath('//input[@name="formhash"]/@value')[0]
    # print(formhash)

    # t value
    log_t = login_page_html.xpath('//form[@method="post"]/@action')[0].split("&t=")[-1]
    # print(log_t)

    # Login URL
    log_url = 'http://muchong.com/bbs/logging.php?action=login&t=' + log_t

    # Form data
    my_data = {'formhash': formhash,
               'refer': '',
               'username': 'bajibajihe',
               'password': 'Yzh963987',
               'cookietime': '31536000',
               'loginsubmit': '提交'}

    res = session.post(url=log_url, data=my_data)
    # print(res.text)
    res_html = etree.HTML(res.text)
    # The site asks an arithmetic verification question after login
    varify_content = res_html.xpath('//div[@style = "padding:10px 0;"]//text()')[0]
    varify = re.search(r'问题:(\d+)(\D+)(\d+)等于多少?', varify_content)
    number1 = int(varify.group(1))
    number2 = int(varify.group(3))
    if varify.group(2) == '加':
        my_answer = number1 + number2
    elif varify.group(2) == '减':
        my_answer = number1 - number2
    elif varify.group(2) == '乘以':
        my_answer = number1 * number2
    else:
        my_answer = number1 / number2

    post_sec_hash = res_html.xpath('//input[@name = "post_sec_hash"]/@value')[0]
    # print(post_sec_hash)
    new_data = {
        'formhash': formhash,
        'post_sec_code': my_answer,
        'post_sec_hash': post_sec_hash,
        'username': 'bajibajihe',
        'loginsubmit': '提交'
    }
    res1 = session.post(log_url, new_data)
    # print(res1.text)
    headers = {
        'Cookie': '_ga=GA1.2.218366420.1658801280; _discuz_cc=36661247307489666; last_ip=120.208.99.4_30246316; Hm_lvt_2207ecfb7b2633a3bc5c4968feb58569=1659937011,1659946720,1660358771,1660449877; _emuch_index=1; discuz_tpl=qing; acw_tc=276077cc16604961356387283e9bcad83fdbb4938555f633de690328b12936; _discuz_uid=30246316; _discuz_pw=b77e1e08db437cc8; _gat=1; Hm_lpvt_2207ecfb7b2633a3bc5c4968feb58569=1660497901',
        'Host': 'muchong.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.54'
    }
    resp = session.get('http://muchong.com/bbs/memcp.php?action=getcredit')

    # print(resp.text)

    res2 = etree.HTML(resp.text)
    # Sample response: ['您已经领取今天的红包啦,\n坚持哦,已经连续 2 天坚持领取红包了\n', '\n']
    # print(res2.xpath('//div[@style = "padding:0 0 10px 0;"]/b[1]/text()')[0].replace('\n','').split(',')[0])
    if res2.xpath('//div[@style = "padding:0 0 10px 0;"]/b[1]/text()')[0].replace('\n', '').split(',')[0] == "您已经领取今天的红包啦":
        return "签到成功 您现在的金币数:", res2.xpath('//span[@style = "color:red;font-weight:bold;font-size:20px;"]/text()')[0]
    else:
        credit_formhash = res2.xpath('//input[@name="formhash"]/@value')[0]
        credit_data = {'formhash': credit_formhash,
                       'getmode': '1',
                       'message': '',
                       'creditsubmit': '领取红包'}
        r = session.post('http://muchong.com/bbs/memcp.php?action=getcredit', data=credit_data)
        get_coins_number = etree.HTML(r.text).xpath('//span[@style="color:red;font-weight:bold;font-size:30px;"]//text()')[0]
        coins = etree.HTML(r.text).xpath('//span[@style="color:red;font-weight:bold;font-size:20px;"]/text()')[0]
        return '今天领取了金币数为:%s' % get_coins_number, '目前的总金币数为:%s' % coins


def push(res):
    # Push the result to WeChat via ServerChan
    url = 'https://sctapi.ftqq.com/SCT165450TxflxDTaPnd9KfYpxBDj40ee3.send'
    data = {
        "title": '小木虫签到',
        "desp": res
    }
    requests.post(url, data)


def main():
    res = log_in()
    push(res)


if __name__ == '__main__':
    main()
```
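
The login flow answers Muchong's arithmetic verification question by pulling the two operands and the operator out of the page text with a regular expression. Here is a standalone check of that pattern against a made-up sample sentence that mirrors the wording the code expects:

```python
import re

# Made-up sample of the verification sentence the login page returns
sample = '问题:3加5等于多少?'
m = re.search(r'问题:(\d+)(\D+)(\d+)等于多少', sample)
number1, op, number2 = int(m.group(1)), m.group(2), int(m.group(3))
ops = {'加': number1 + number2, '减': number1 - number2, '乘以': number1 * number2}
print(number1, op, number2, '->', ops.get(op, number1 / number2))  # 3 加 5 -> 8
```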

## 3. Automated sign-in on the Ruike forum

```python
# -*- coding:utf-8 -*-
# @Time :2022/8/13 11:01
# @SOFTWARE :爬虫学习

import requests


def qiandao():
    session = requests.session()

    url_cook = 'https://www.ruike1.com/k_misign-sign.html?operation=qiandao&format=global_usernav_extra&formhash=c6f8a78f&inajax=1&ajaxtarget=k_misign_topb'

    headers = {
        # 'referer':' https://www.ruike1.com/home.php?mod=space&do=pm',
        # 'user-agent':' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.54',
        'cookie': 'Hm_lvt_9d4dff14662c55b14fcd14fd34b8dcd8=1650421290,1650440143,1650593469,1650679950; zzhh_2132_saltkey=ptF5Jfl1; zzhh_2132_lastvisit=1660527542; zzhh_2132_ulastactivity=54adnP9uL38shJITAAmPYGl8UhDcwc6fyMZ0gquuRIvYA5KN6PR6; zzhh_2132_connect_is_bind=1; zzhh_2132_nofavfid=1; zzhh_2132_newemail=23994%091946497315%40qq.com%091660531193; zzhh_2132_sid=EO5ApZ; Hm_lvt_73ad58a7cf08cf5833714aed91aa7068=1660355572,1660445351,1660530897,1660535769; zzhh_2132_auth=1187HYNSK3dMtNMMciXLfjmKJwWKBC9uzwGER%2BIaOKlUhqLC%2FHqAUBAxNDArZq4pp%2BaYe9IpyVPmvhc90%2FeZvn6kYg; zzhh_2132_lastcheckfeed=23994%7C1660535772; zzhh_2132_lip=120.208.99.4%2C1660447350; zzhh_2132_seccodecSEO5ApZ=2.f08206f3c72f8f6fa1; zzhh_2132_sendmail=1; zzhh_2132_checkpm=1; Hm_lpvt_73ad58a7cf08cf5833714aed91aa7068=1660536077; zzhh_2132_lastact=1660536077%09misc.php%09patch'
    }

    res = session.get(url=url_cook, headers=headers)

    # The sign-in endpoint answers with XML; the message sits inside a CDATA block
    return res.text.split('CDATA')[-1]


def push(res):
    # Push the result to WeChat via ServerChan
    url = 'https://sctapi.ftqq.com/SCT165450TxflxDTaPnd9KfYpxBDj40ee3.send'
    data = {
        "title": '瑞客签到',
        "desp": res
    }
    requests.post(url, data)


def main():
    res = qiandao()
    push(res)


if __name__ == '__main__':
    main()
```
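
`res.text.split('CDATA')[-1]` returns the message plus the trailing `]]>` and closing XML tags. If a cleaner string is wanted, the CDATA payload can be pulled out with a regex instead. This is only a sketch; it assumes the usual `<![CDATA[...]]>` wrapper in the XML the endpoint returns:

```python
import re

def extract_cdata(xml_text):
    # Grab the text between <![CDATA[ and ]]>; fall back to the raw response
    m = re.search(r'<!\[CDATA\[(.*?)\]\]>', xml_text, re.S)
    return m.group(1).strip() if m else xml_text
```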

## 4. Scraping Huya live-stream cover images

```python
# -*- coding:utf-8 -*-
# @Time :2022/8/22 10:24
# @SOFTWARE :爬虫学习


"""
Approach:
1. Connect to the Huya live-stream page
2. Fetch the page source
3. Parse the image URLs out of the source
4. Request each image URL and save it locally
"""

import requests
from lxml import etree

# 1. The page is fetched with a plain GET request
url = 'https://www.huya.com/g/xingxiu#cate-1-27'
# url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
}
res = requests.get(url=url, headers=headers)
print(res.content)

# 2-3. Parse the source and extract the cover image URLs
html = etree.HTML(res.content)
lis = html.xpath('//*[@id="js-live-list"]/li/a[1]/img/@src')
print(lis)
```
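
Step 4 of the outline (fetching each cover URL and saving it locally) is not in the snippet above. A minimal sketch that continues from the variables `lis` and `headers` is below; it assumes the extracted entries are direct image URLs (some may be lazy-load placeholders that need a different attribute):

```python
import os

out_dir = 'huya_covers'
os.makedirs(out_dir, exist_ok=True)
for i, img_url in enumerate(lis):
    # Some cover URLs are protocol-relative ("//..."); prepend the scheme if so
    if img_url.startswith('//'):
        img_url = 'https:' + img_url
    img = requests.get(img_url, headers=headers).content
    with open(os.path.join(out_dir, '{}.jpg'.format(i)), 'wb') as f:
        f.write(img)
```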

## 5. Scraping the Douban Top 250

```python
# -*- coding:utf-8 -*-
# @Time       :2022/8/23 11:17
# @SOFTWARE   :爬虫学习

"""
1. Connect to the target URL
2. Locate the content to scrape
3. Extract the content and save it
"""

import requests
from lxml import etree

def spider():
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
    }
    res = requests.get(url=url, headers=headers)
    print(res.content.decode())
    html = etree.HTML(res.text)
    # title = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()')
    # print(title)
    # remark = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div/span[4]')
    # print(remark)
    # grade = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div/span[2]')
    # center = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/p[2]/span')

    # with open('top250.csv','wb') as f:
    #     f.write(title)
    # f.close()

if __name__ == '__main__':
    spider()
```
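
The extraction and saving in step 3 are still commented out above (and `f.write(title)` would fail anyway, since `title` is a list of strings, not bytes). Here is a minimal sketch of finishing that step with the `csv` module, reusing the title and rating XPaths from the comments (with `/text()` added for the rating):

```python
import csv

def save_top250(html):
    # Same XPaths as the commented-out lines in spider()
    titles = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()')
    grades = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div/span[2]/text()')
    with open('top250.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'rating'])
        writer.writerows(zip(titles, grades))
```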
    

## 6. Automated sign-in for Tencent Video

```python
# -*- coding:utf-8 -*-
# @Time :2022/9/2 10:35
# @SOFTWARE :爬虫

import requests
from requests import post

# Tencent Video cookie
tx_cookie = 'RK=J1kB4/jBcl; ptcz=7a4c2ebc2ec907eff3406b538c1345e746da0344ffe25c6d815a3e9d290b59ef; pgv_pvid=3235502464; fqm_pvqid=9add1a34-0489-4999-be56-071778ac10ad; eas_sid=o1B6Q4E398W9b9Q8v7V7M6h1c5; LW_uid=51U6D5T3G5f3C9H6N248U7R6o9; uin_cookie=o1946497315; ied_qq=o1946497315; o_cookie=1946497315; pac_uid=1_1946497315; iip=0; tvfe_boss_uuid=d6a51a1d80eaf478; video_platform=2; LW_sid=i1h6l68280P4J96179D72027U7; pgv_info=ssid=s1080425395; video_guid=10415ddb7eef607d; qpsvr_localtk=0.19202484948915632; main_login=qq; vqq_access_token=12FF3453F36D7A6B7CCD35A8DF65D7B2; vqq_appid=101483052; vqq_openid=095EA2916C3F9224C96BA1E30A10677B; vqq_vuserid=193206557; vqq_vusession=uFlrAOGUtFjNR0rJbJ-ZTg.N; vqq_refresh_token=6649F75421F7722F23F0EFBCA11A61A4; login_time_init=2022-9-2 10:43:18; vqq_next_refresh_time=6596; vqq_login_time_init=1662086600'
auth_refresh_url = 'https://access.video.qq.com/user/auth_refresh?vappid=11059694&vsecret=fdf61a6be0aad57132bc5cdf78ac30145b6cd2c1470b0cfe&type=qq&g_tk=&g_vstk=86707172&g_actk=110063020&callback=jQuery191048904872757789786_1662086599463&_=1662086599481'

# ServerChan SendKey (not defined in the original snippet; fill in your own)
server_key = ''


# Push via the new ServerChan (Server酱) API
def send_server(title, content):
    server_content = {'text': title, 'desp': content}
    server_url = "https://sctapi.ftqq.com/%s.send" % server_key
    resp = requests.post(server_url, params=server_content)
    print('新版Server酱推送状态码为: %s' % resp.status_code)


# Tencent Video sign-in
def tx_sign():
    url1 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=hierarchical_task_system&cmd=2'
    url2 = 'https://v.qq.com/x/bu/mobile_checkin'
    # url3 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=spp_MissionFaHuo&cmd=4&task_id=1'  # watch for 60 minutes
    # url4 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=spp_MissionFaHuo&cmd=4&task_id=7'  # download
    # url5 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=spp_MissionFaHuo&cmd=4&task_id=6'  # gift
    # url6 = 'https://vip.video.qq.com/fcgi-bin/comm_cgi?name=spp_MissionFaHuo&cmd=4&task_id=3'  # bullet comments
    login_headers = {
        'Referer': 'https://v.qq.com',
        'Cookie': tx_cookie
    }
    login = requests.get(auth_refresh_url, headers=login_headers)
    resp_cookie = requests.utils.dict_from_cookiejar(login.cookies)
    if not resp_cookie:
        send_server('腾讯视频V力值签到通知', '获取Cookie失败,Cookie失效')
    # Rebuild the cookie string with the refreshed vqq_vusession value
    arr = tx_cookie.split('; ')
    sign_cookie = ''
    for item in arr:
        if 'vqq_vusession' in item:
            continue
        else:
            sign_cookie += (item + '; ')
    sign_cookie += ('vqq_vusession=' + resp_cookie['vqq_vusession'] + ';')
    sign_headers = {
        'Cookie': sign_cookie,
        'Referer': 'https://m.v.qq.com'
    }
    send_message = ''
    sign1 = response_handle(url1, sign_headers)
    send_message += '链接1' + sign1 + '\n'
    # sign2 = response_handle(url2, sign_headers)
    send_message += '链接2' + '任务未完成' + '\n'
    # sign3 = response_handle(url3, sign_headers)
    # send_message += '链接3' + sign3 + '\n'
    # sign4 = response_handle(url4, sign_headers)
    # send_message += '链接4' + sign4 + '\n'
    # sign5 = response_handle(url5, sign_headers)
    # send_message += '链接5' + sign5 + '\n'
    # sign6 = response_handle(url6, sign_headers)
    # send_message += '链接6' + sign6 + '\n'
    mes = '腾讯视频V力值签到通知\n\n' + send_message
    return mes


# Handle the responses returned by Tencent Video
def response_handle(url, sign_headers):
    resp_str = requests.get(url, headers=sign_headers).text
    if '-777903' in resp_str:
        return "已获取过V力值"
    elif '-777902' in resp_str:
        return "任务未完成"
    elif 'OK' in resp_str:
        return "成功,获得V力值:" + resp_str[42:-14]
    else:
        return "执行出错"


if __name__ == '__main__':
    message = tx_sign()
    print(message)
    # send_server('腾讯视频签到通知', message)
```
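
`response_handle` pulls the earned V力值 out with a hard-coded slice (`resp_str[42:-14]`), which breaks as soon as the JSONP wrapper changes length. The exact response layout is not shown in these notes, so the sketch below is only a guess: it grabs the first run of digits after the `OK` marker instead of slicing by position.

```python
import re

def extract_score(resp_str):
    # Assumption: the score is the first number that follows the "OK" marker;
    # fall back to the raw string if nothing matches.
    m = re.search(r'OK\D*?(\d+)', resp_str)
    return m.group(1) if m else resp_str
```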
    

## 7. Lianjia second-hand housing listings

```python
# -*- coding:utf-8 -*-
# @Time :2022/8/24 9:41
# @SOFTWARE :爬虫

"""
1. Connect to the URL and fetch the page source
2. Parse the source and extract the data
3. Visualize the data
"""

import requests
from lxml import etree
import pandas as pd
from pyecharts.charts import Bar  # bar chart


def spider():
    # 1. Fetch the page source
    url = 'https://ty.lianjia.com/ershoufang/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
    }
    res = requests.get(url=url, headers=headers)

    # 2. Parse the source
    html = etree.HTML(res.text)
    div_list = html.xpath('//div[@class = "info clear"]')

    # 3. Extract the data
    # 3.1 Empty list to hold the listings
    second_house = []
    for div in div_list:
        # 1. Name of the residential compound
        position = div.xpath('.//div[@class = "positionInfo"]/a/text()')[0]
        # 2. Total price of the listing (relative XPath, so each div yields its own price)
        total_price = div.xpath('.//div[@class = "totalPrice totalPrice2"]/span/text()')[0]
        # 3. Unit price of the listing
        unitPrice = div.xpath('.//div[@class = "unitPrice"]/span/text()')[0].replace("元/平", "").replace(",", "")
        second_house.append([position, float(unitPrice), float(total_price)])

    title = ["小区名称", "单价:元/平", "总价:万"]
    # 4. Save the data locally
    table = pd.DataFrame(data=second_house, columns=title)

    # 5. Visualize it as a bar chart
    bar = Bar()
    # Set the x axis and y axis
    bar.add_xaxis(list(table["小区名称"]))
    bar.add_yaxis("太原二手房价格", list(table["单价:元/平"]))

    # Render to HTML
    bar.render("./second_house_TaiYuai.html")


if __name__ == '__main__':
    spider()
```
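
Step 4 says the data is saved locally, but `table` is only built, never written out. If a local copy is wanted alongside the rendered chart, one line with pandas does it (the file name here is arbitrary):

```python
# Write the listings table to CSV next to the rendered chart (file name is illustrative)
table.to_csv('./second_house_TaiYuan.csv', index=False, encoding='utf-8-sig')
```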