爬取去哪儿网酒店信息
发布日期:2021-06-29 14:37:51 浏览次数:2 分类:技术文章

本文共 4780 字,大约阅读时间需要 15 分钟。

不说太多废话,就简单一句:你们要爬哪里可以把地点改一下,还有时间改一下,爬取数量自己修改参数和代码,变化不大。有问题请留言,我不再重复分析(这里我爬取的是上海最近的酒店信息)

# coding=utf-8
"""Scrape hotel listings for a city from Qunar's WeChat mini-app API.

Each result page is appended as raw rows to ``qunaer9.csv``.  The request
headers/cookies below were captured from the mini-app; they may expire and
need refreshing — TODO confirm they are still accepted by the endpoint.
"""
import json  # kept for callers that may rely on it being imported here
import time

import pandas as pd
import requests

API_URL = 'https://wxapp.qunar.com/api/hotel/hotellist'  # target endpoint
CSV_PATH = 'qunaer9.csv'

# Request headers and cookie captured from the WeChat mini-app.
HEADERS = {
    "wx-v": "",
    "content-type": "application/json",
    "Connection": "Keep-Alive",
    "Accept-Encoding": "gzip",
    "wx-q": "",
    "unionid": "ovaMOwE6dQvbGOmZjLLPaGSM5ZtU",
    "openid": "oIjYJ0TuQcTF_WTWsKcUPR1cRJI0",
    "wx-t": "",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0.1; OPPO A57 Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.91 Mobile Safari/537.36 MicroMessenger/6.7.2.1340(0x2607023A) NetType/WIFI Language/zh_CN",
    "charset": "utf-8",
    "referer": "https://servicewechat.com/wx799d4d93a341b368/114/page-frame.html",
    "Host": "wxapp.qunar.com",
    "Cookie": "QN48=tc_437f21c62a765ca0_165c198a408_e56b; QN1=qunar; QN66=smart_app; QN1=O5cv+luWLPthsvB1BKl0Ag==",
    "Content-Length": "0",
}

# Candidate HTTP proxies (the original script defined p0..p15 but only ever
# used p1).  Index 1 is kept as the active one; swap the index if it dies.
PROXY_POOL = [
    {'http': 'http://101.132.122.230:3128'},
    {'http': 'http://114.113.126.83:80'},
    {'http': 'http://210.45.123.127:9999'},
    {'http': 'http://118.190.217.182:80'},
    {'http': 'http://120.27.14.125:80'},
    {'http': 'http://118.31.223.194:3128'},
    {'http': 'http://101.37.79.125:3128'},
    {'http': 'http://125.62.26.197:3128'},
    {'http': 'http://218.60.8.98:3129'},
    {'http': 'http://114.215.95.188:3128'},
    {'http': 'http://218.60.8.99:3129'},
    {'http': 'http://218.60.8.83:3129'},
    {'http': 'http://118.190.217.61:80'},
    {'http': 'http://203.86.26.9:3128'},
    {'http': 'http://114.113.126.87:80'},
    {'http': 'http://106.12.32.43:3128'},
]
ACTIVE_PROXY = PROXY_POOL[1]


def _fetch_page(city, page, check_in, check_out):
    """POST one page of the hotel list; return the parsed ``data`` payload.

    Returns the ``data`` dict of the JSON response, which (per the observed
    response) contains at least ``totalPage`` and ``hotels``.
    Raises ``requests.RequestException`` on network failure and
    ``KeyError``/``ValueError`` on an unexpected response body.
    """
    params = {
        "city": city,
        "cityUrl": "",
        "page": page,
        "extra": "{}",
        "sort": "",
        "keywords": "",
        "checkOutDate": check_out,
        "checkInDate": check_in,
        "locationAreaFilter": "",
        "comprehensiveFilter": "[]",
        "fixedComprehensiveFilter": "[]",
        "SDKVersion": "2.2.4",
        "wxUnionId": "ovaMOwE6dQvbGOmZjLLPaGSM5ZtU",
        "wxOpenId": "oIjYJ0TuQcTF_WTWsKcUPR1cRJI0",
        "bd_source": "smart_app",
        "bd_origin": "pt-onl-ots-ggjd",
    }
    # The endpoint accepts the payload as query params on a POST (as the
    # original script did), so ``params=`` is kept rather than ``json=``.
    r = requests.post(API_URL, headers=HEADERS, params=params,
                      proxies=ACTIVE_PROXY)
    return r.json()['data']


def _save_hotels(hotels):
    """Append one page of hotel rows to CSV_PATH (no header, append mode)."""
    pd.DataFrame(data=hotels).to_csv(CSV_PATH, mode='a', header=False)


def crow_id(city, check_in="2020-10-29", check_out="2020-10-29"):
    """Crawl every hotel-list page for *city* and append rows to the CSV.

    Fixes over the original version:
    - all pages now use the same check-in/check-out dates (the original
      queried 2020-10-29 for page 1 but 2020-11-1/2020-11-2 afterwards);
    - the pagination loop no longer requests one page past ``totalPage``.

    :param city: city name as the API expects it, e.g. ``'上海'``.
    :param check_in: check-in date, ``YYYY-MM-DD`` (default preserved from
        the original script).
    :param check_out: check-out date, ``YYYY-MM-DD``.
    """
    first = _fetch_page(city, 1, check_in, check_out)
    total_pages = first['totalPage']
    print("当前总页数:", total_pages)
    print("Page:%d" % 1)
    _save_hotels(first['hotels'])

    # Remaining pages: 2 .. total_pages inclusive.
    for page in range(2, total_pages + 1):
        try:
            data = _fetch_page(city, page, check_in, check_out)
            _save_hotels(data['hotels'])
        except Exception as e:
            # Best-effort scrape: log the failure and keep going so one bad
            # page (or flaky proxy) doesn't abort the whole crawl.
            print(e)
        finally:
            print("Page:%d" % page)
            time.sleep(3.1)  # throttle to avoid tripping rate limits


if __name__ == '__main__':
    # Area configuration: add more cities under "areaObj" to crawl them too.
    a = {"areaObj": {"上海": [{"city": '上海'}]}}
    area_list = [d for areas in a['areaObj'].values() for d in areas]
    for area in area_list:
        print("开始抓取%s区域:" % (area['city']))
        crow_id(area['city'])

转载地址:https://chuanchuan.blog.csdn.net/article/details/109529610 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!

上一篇:C++判断素数详细讲解与代码
下一篇:Turtle入门(实例)

发表评论

最新留言

留言是一种美德,欢迎回访!
[***.207.175.100]2024年04月30日 11时16分54秒