Created January 7, 2022 16:28
python | parse yandex images | requests | json | params | search by image | search by word
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
from glob import glob

import lxml.html
import requests
from lxml.html.clean import Cleaner
# Browser-like headers copied from a Chrome 96 session on yandex.ru.
headers = {
    'authority': 'yandex.ru',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'device-memory': '8',
    'rtt': '150',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
    'viewport-width': '575',
    'dpr': '1',
    'downlink': '4.15',
    'ect': '4g',
    'sec-ch-ua-platform': '"Windows"',
    'accept': 'application/json',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5',
}

def info(doc_id):
    """Fetch related-image ("rim") metadata for one Yandex Images document id."""
    params = (
        ('docid', str(doc_id)),
        ('lang', 'ru'),
        ('mt', '1'),
        ('family', '0'),
        ('pornowhitelist', '1'),
        ('ipnd', '1'),
    )
    return requests.get('https://yandex.ru/images-apphost/rim',
                        headers=headers, params=params).json()

def load_image(byte):
    """Upload raw image bytes to Yandex; the JSON response carries the
    CBIR preview URL that the reverse image search is run against."""
    headers = {
        'authority': 'yandex.ru',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'device-memory': '8',
        'rtt': '200',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'viewport-width': '794',
        'content-type': 'image/jpeg',
        'dpr': '1',
        'downlink': '2.65',
        'ect': '4g',
        'sec-ch-ua-platform': '"Windows"',
        'accept': '*/*',
        'origin': 'https://yandex.ru',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5',
    }
    params = (
        ('cbird', '37'),
        ('images_avatars_size', 'preview'),
        ('images_avatars_namespace', 'images-cbir'),
    )
    response = requests.post('https://yandex.ru/images-apphost/image-download',
                             headers=headers, params=params, data=byte).json()
    print(response)
    return response

def getInfoImage(url):
    """Run a Yandex Images search and yield the rimId of every result block.

    `url` may be the dict returned by load_image(), a plain search phrase,
    or a direct image URL.
    """
    headers = {
        'authority': 'yandex.ru',
        'cache-control': 'max-age=0',
        'device-memory': '8',
        'dpr': '1',
        'viewport-width': '1280',
        'rtt': '200',
        'downlink': '2.2',
        'ect': '4g',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5',
    }
    if isinstance(url, dict):
        # Dict returned by load_image(): swap the trailing /preview of the
        # CBIR avatar URL for /orig and search by the uploaded image.
        params = (
            ('url', url['url'].rsplit("/", 1)[0] + "/orig"),
            ('cbir_id', url['url'].split("get-images-cbir/")[-1].split("/preview")[0]),
            ('cbir_page', 'similar'),
            ('rpt', 'imageview'),
            ('family', '0'),
            ('pornowhitelist', '1'),
            ('ipnd', '1'),
        )
    elif not url.startswith("http"):
        # Plain phrase: ordinary search by word.
        params = (
            ('text', url),
            ('from', 'tabbar'),
            ('family', '0'),
            ('pornowhitelist', '1'),
            ('ipnd', '1'),
        )
    else:
        # Direct image URL: reverse image search.
        params = (
            ('url', url),
            ('cbir_page', 'similar'),
            ('rpt', 'imageview'),
            ('family', '0'),
            ('pornowhitelist', '1'),
            ('ipnd', '1'),
        )
    response = requests.get('https://yandex.ru/images/search',
                            headers=headers, params=params)
    print(response.url)
    root = lxml.html.fromstring(response.content)
    # Every serp item keeps its metadata in a JSON-encoded data-bem attribute.
    for raw in root.xpath('//*[@id]/@data-bem'):
        item = json.loads(raw)
        if "serp-item" in item and "rimId" in item["serp-item"]:
            yield item["serp-item"]["rimId"]

def sJson(response, name):
    """Debug helper: write cleaned HTML to <name>.html."""
    with open(f"{name}.html", "w", encoding="utf-8") as f:
        cleaner = Cleaner(style=True, scripts=True, javascript=True, inline_style=True,
                          links=True, add_nofollow=False,
                          page_structure=True, safe_attrs_only=False)
        f.write(cleaner.clean_html(response))


all_links = []


def vldc(elem):
    """Return True if the link answers an HTTP request, False otherwise."""
    try:
        requests.get(elem)
        return True
    except requests.RequestException:
        return False


def get_from_dict(all_links, response):
    """Collect the image URL ("iu") of every entry in a rim response."""
    for block in response["rld"]:
        for item in block["s"]:
            if "iu" in item:
                all_links.append(item["iu"])


def Glob_matching(src):
    """True if `src` names a file in the current working directory."""
    return src in glob("*.*")

def collect_links(responses):
    """Walk an iterable of rim responses and gather every image URL,
    following nested document ids one level deep."""
    for response in responses:
        for block in response["rld"]:
            for item in block["s"]:
                if "iu" in item:
                    all_links.append(item["iu"])
                if "id" in item:
                    get_from_dict(all_links, info(item["id"]))
    return all_links


def links_yd(uri):
    """Search Yandex Images by local file path, raw image bytes, image URL
    or plain text and return the collected image links."""
    if isinstance(uri, str) and (uri.startswith("C:") or Glob_matching(uri)):
        # Local file: upload it, then reverse-search by the uploaded image.
        with open(uri, "rb") as image:
            data = image.read()
        try:
            return collect_links(map(info, getInfoImage(load_image(data))))
        except (requests.RequestException, KeyError):
            return None
    elif isinstance(uri, bytes):
        # Raw image bytes: same flow without touching the filesystem.
        return collect_links(map(info, getInfoImage(load_image(uri))))
    else:
        # Plain phrase or direct image URL.
        return collect_links(map(info, getInfoImage(uri)))


print(links_yd("cats"))
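
Usage: links_yd() accepts a plain search phrase, a direct image URL, a local file path, or raw image bytes. A minimal sketch of the other call forms (the URL and file name below are placeholders, not values from the gist):

# reverse search by a direct image URL (placeholder URL)
print(links_yd("https://example.com/picture.jpg"))

# reverse search by a local file in the current directory, or by raw bytes
print(links_yd("picture.jpg"))
with open("picture.jpg", "rb") as f:
    print(links_yd(f.read()))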
Comment: Thanks, this really helped me figure out the Yandex POST request.
Author: You're welcome.
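
For reference, the upload request mentioned above boils down to a single POST of the raw image bytes to the image-download endpoint. A minimal standalone sketch under the same assumptions as the gist (trimmed header set, placeholder file name):

import requests

with open("picture.jpg", "rb") as f:  # placeholder file name
    data = f.read()

resp = requests.post(
    "https://yandex.ru/images-apphost/image-download",
    params={
        "cbird": "37",
        "images_avatars_size": "preview",
        "images_avatars_namespace": "images-cbir",
    },
    headers={
        "content-type": "image/jpeg",
        # the gist sends a fuller browser-like header set; a bare user-agent
        # may or may not be enough, depending on Yandex's filtering
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    },
    data=data,
)
print(resp.json())  # contains the CBIR preview URL used for the follow-up search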