更新时间:2019年07月26日 10时50分33秒 来源:黑马程序员论坛
直接介绍一下具体的步骤以及注意点: instagram 爬虫注意点
# Runnable code (a local proxy is configured to get past the firewall; remove
# it if you do not need one):
# -*- coding:utf-8 -*-
import requests
import re
import json
import urllib.parse
import hashlib
import sys

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
BASE_URL = 'https://www.instagram.com'
# NOTE(review): the scraped copy of this script pointed the next two URLs (and
# the sample post URL at the bottom) at unrelated hosts. They are assumed to
# belong on the same host as BASE_URL -- confirm against the original script.
ACCOUNT_MEDIAS = BASE_URL + '/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables=%s'
ACCOUNT_PAGE = BASE_URL + '/%s'
proxies = {
    'http': 'http://127.0.0.1:1087',
    'https': 'http://127.0.0.1:1087',
}
# Seconds to wait for a response; without a timeout a stalled proxy or server
# would block the script indefinitely.
REQUEST_TIMEOUT = 10

# A way to configure the proxy only once: attach it to a session so that the
# `proxies` argument does not have to be passed on every requests call.
# s = requests.session()
# s.proxies = {'http': '121.193.143.249:80'}


def _fetch_text(url, gis_token=''):
    """Return the body text of a GET request to *url* sent through `proxies`."""
    response = requests.get(
        url,
        headers=generate_header(gis_token),
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    return response.text


def get_shared_data(html=''):
    """Extract and decode the ``window._sharedData`` JSON from a page.

    :param html: page source to parse; when empty, BASE_URL is downloaded.
    :return: the object decoded from the ``_sharedData`` assignment.
    :raises ValueError: if no ``_sharedData`` assignment is found in the page.
    """
    target_text = html if html else _fetch_text(BASE_URL)
    regx = r"\s*.*\s*<script.*?>.*_sharedData\s*=\s*(.*?);<\/script>"
    match_result = re.match(regx, target_text, re.S)
    if match_result is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('window._sharedData was not found in the page')
    return json.loads(match_result.group(1))


# def get_rhx_gis():
#     """get the rhx_gis value from sharedData
#     """
#     share_data = get_shared_data()
#     return share_data['rhx_gis']


def get_account(user_name):
    """Download a profile page and return its resolved account info.

    :param user_name: account name inserted into ACCOUNT_PAGE.
    :return: dict built by resolve_account_data().
    """
    page_html = _fetch_text(get_account_link(user_name))
    return resolve_account_data(get_shared_data(page_html))


def get_media_by_user_id(user_id, count=50, max_id=''):
    """Collect up to *count* media entries for a user, page by page.

    :param user_id: account id sent as the ``id`` query variable.
    :param count: maximum number of entries to return (also sent as ``first``).
    :param max_id: cursor sent as ``after`` for the first request.
    :return: list of dicts built by general_resolve_media().
    """
    medias = []
    has_next_page = True
    # Stop as soon as `count` entries are collected; looping while
    # `<= count` would issue one more request just to discard its result.
    while len(medias) < count and has_next_page:
        # Compact separators: json.dumps defaults to (', ', ': '), which
        # would put a space after every ':' and ',' in the query string.
        varibles = json.dumps({
            'id': str(user_id),
            'first': count,
            'after': str(max_id),
        }, separators=(',', ':'))
        media_json_data = json.loads(_fetch_text(get_account_media_link(varibles)))
        timeline = media_json_data['data']['user']['edge_owner_to_timeline_media']
        edges = timeline['edges']
        if not edges:
            return medias
        for item in edges:
            if len(medias) == count:
                return medias
            medias.append(general_resolve_media(item['node']))
        max_id = timeline['page_info']['end_cursor']
        has_next_page = timeline['page_info']['has_next_page']
    return medias


def get_media_by_url(media_url):
    """Download the ``?__a=1`` JSON of a post URL and resolve it to a media dict."""
    media_json = json.loads(_fetch_text(get_media_url(media_url)))
    return general_resolve_media(media_json['graphql']['shortcode_media'])


def get_account_media_link(varibles):
    """Return ACCOUNT_MEDIAS with the URL-quoted *varibles* string filled in."""
    return ACCOUNT_MEDIAS % urllib.parse.quote(varibles)


def get_account_link(user_name):
    """Return ACCOUNT_PAGE with *user_name* filled in."""
    return ACCOUNT_PAGE % user_name


def get_media_url(media_url):
    """Return *media_url* without trailing slashes, followed by ``/?__a=1``."""
    return media_url.rstrip('/') + '/?__a=1'


# def generate_instagram_gis(varibles):
#     rhx_gis = get_rhx_gis()
#     gis_token = rhx_gis + ':' + varibles
#     x_instagram_token = hashlib.md5(gis_token.encode('utf-8')).hexdigest()
#     return x_instagram_token


def generate_header(gis_token=''):
    """Build the request headers.

    :param gis_token: when non-empty, sent as the ``x-instagram-gis`` header.
    :return: dict of HTTP headers.
    """
    # todo: if have session, add the session key:value to header
    header = {
        'user-agent': USER_AGENT,
    }
    if gis_token:
        header['x-instagram-gis'] = gis_token
    return header


def general_resolve_media(media):
    """Flatten one raw media node into a plain dict.

    Optional keys (``title``, ``dimensions``, ``thumbnail_src``,
    ``video_url``) fall back to an empty value when missing or falsy, and
    ``content`` is ``''`` when the node has no caption edges.

    :param media: raw media node (dict) from a response.
    :return: dict with the selected fields.
    """
    caption_edges = media['edge_media_to_caption']['edges']
    return {
        'id': media['id'],
        # Drops the first five characters of the type name; presumably the
        # 'Graph' prefix (e.g. 'GraphImage' -> 'image') -- TODO confirm.
        'type': media['__typename'][5:].lower(),
        'content': caption_edges[0]['node']['text'] if caption_edges else '',
        'title': media.get('title') or '',
        'shortcode': media['shortcode'],
        'preview_url': BASE_URL + '/p/' + media['shortcode'],
        'comments_count': media['edge_media_to_comment']['count'],
        'likes_count': media['edge_media_preview_like']['count'],
        'dimensions': media.get('dimensions') or {},
        'display_url': media['display_url'],
        'owner_id': media['owner']['id'],
        'thumbnail_src': media.get('thumbnail_src') or '',
        'is_video': media['is_video'],
        'video_url': media.get('video_url') or '',
    }


def resolve_account_data(account_data):
    """Pick the account fields out of a decoded ``_sharedData`` object.

    :param account_data: dict as returned by get_shared_data() for a profile page.
    :return: dict with locale info and the first ProfilePage entry's user fields.
    """
    user = account_data['entry_data']['ProfilePage'][0]['graphql']['user']
    return {
        'country': account_data['country_code'],
        'language': account_data['language_code'],
        'biography': user['biography'],
        'followers_count': user['edge_followed_by']['count'],
        'follow_count': user['edge_follow']['count'],
        'full_name': user['full_name'],
        'id': user['id'],
        'is_private': user['is_private'],
        'is_verified': user['is_verified'],
        'profile_pic_url': user['profile_pic_url_hd'],
        'username': user['username'],
    }


if __name__ == '__main__':
    # Demo run; guarded so that importing this module performs no requests.
    account = get_account('shaq')
    result = get_media_by_user_id(account['id'], 56)
    media = get_media_by_url('https://www.instagram.com/p/Bw3-Q2XhDMf/')
    print(len(result))
    print(result)

# Packaged as a library! Beyond this script, for convenience I wrote a library
# and published it on GitHub; it contains many more operations, and I hope you
# will take a look and offer suggestions. If it is useful to you, stars and
# PRs are welcome -- thank you all!
推荐了解热门学科
java培训 | Python人工智能 | Web前端培训 | PHP培训 |
区块链培训 | 影视制作培训 | C++培训 | 产品经理培训 |
UI设计培训 | 新媒体培训 | 产品经理培训 | Linux运维 |
大数据培训 | 智能机器人软件开发 |
传智播客是一家致力于培养高素质软件开发人才的科技公司,“黑马程序员”是传智播客旗下高端IT教育品牌。自“黑马程序员”成立以来,教学研发团队一直致力于打造精品课程资源,不断在产、学、研3个层面创新自己的执教理念与教学方针,并集中“黑马程序员”的优势力量,针对性地出版了计算机系列教材50多册,制作教学视频数十套,发表各类技术文章数百篇。
传智播客从未停止思考
传智播客副总裁毕向东在2019IT培训行业变革大会提到,“传智播客意识到企业的用人需求已经从初级程序员升级到中高级程序员,具备多领域、多行业项目经验的人才成为企业用人的首选。”
中级程序员和初级程序员的差别在哪里?
项目经验。毕向东表示,“中级程序员和初级程序员最大的差别在于中级程序员比初级程序员多了三四年的工作经验,从而多出了更多的项目经验。”为此,传智播客研究院引进曾在知名IT企业如阿里、IBM就职的高级技术专家,集中研发面向中高级程序员的课程,用以满足企业用人需求,尽快补全IT行业所需的人才缺口。
何为中高级程序员课程?
传智播客进行了定义。中高级程序员课程,是在当前主流的初级程序员课程的基础上,增加多领域多行业的含金量项目,从技术的广度和深度上进行拓展。“我们希望用5年的时间,打造上百个高含金量的项目,覆盖主流的32个行业。”传智播客课程研发总监于洋表示。
黑马程序员热门视频教程【点击播放】
Python入门教程完整版(懂中文就能学会) | 零起点打开Java世界的大门 |
C++| 匠心之作 从0到1入门学编程 | PHP|零基础入门开发者编程核心技术 |
Web前端入门教程_Web前端html+css+JavaScript | 软件测试入门到精通 |