Last active
January 7, 2022 16:23
-
-
Save dEN5-tech/43b31d4b6b779ca1a0e0a74eb4cff78d to your computer and use it in GitHub Desktop.
python | parse yandex images | requests | json | params
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #created dEN5#7360 (DISCORD) | |
| #USE https://curl.trillworks.com/ | |
| import requests | |
| import json | |
| from bs4 import BeautifulSoup as bs | |
| type_img_d= { | |
| "gif":"gifan", | |
| "png":"png", | |
| "jpg":"jpg" | |
| } | |
| type_img_size= { | |
| "Большие":"large", | |
| "Средние":"medium", | |
| "Маленькие":"small" | |
| } | |
| def get_req_img_whith_yandex(query_mn,start_=0,limit=1,type_="choice",add_page = True): | |
| img_size,type_img,recent = False,False,False | |
| headers = { | |
| 'authority': 'yandex.ru', | |
| 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"', | |
| 'device-memory': '4', | |
| 'rtt': '250', | |
| 'sec-ch-ua-mobile': '?0', | |
| 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36', | |
| 'viewport-width': '791', | |
| 'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01', | |
| 'x-requested-with': 'XMLHttpRequest', | |
| 'dpr': '1', | |
| 'downlink': '4.6', | |
| 'ect': '4g', | |
| 'sec-fetch-site': 'same-origin', | |
| 'sec-fetch-mode': 'cors', | |
| 'sec-fetch-dest': 'empty', | |
| 'referer': 'https://yandex.ru/images/search?from=tabbar&text=google%20search%20api', | |
| 'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5' | |
| } | |
| list_links = [] | |
| list_dict = [] | |
| iter = 0 | |
| iter+=start_ | |
| start = time.monotonic() | |
| end = float() | |
| pager = [] | |
| while add_page: | |
| params = [ | |
| ('format', 'json'), | |
| ('request', '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"serp-list_infinite_yes","params":{"initialPageNum":0},"version":2},{"block":"more_direction_next","params":{},"version":2},{"block":"gallery__items:ajax","params":{},"version":2}],"metadata":{"bundles":{"lb":"jCgK5?b*G$Xvb>:BUOR$"},"assets":{"las":"justifier-height=1;thumb-underlay=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;ca993f.0=1;d30d05.0=1;105ac6.0=1;bed1df.0=1"},"version":"0x0f74f9d0500","extraContent":{"names":["i-react-ajax-adapter"]}},"bmt":{"lb":"jCgK5?b*G$Xvb>:BUOR$"},"amt":{"las":"justifier-height=1;thumb-underlay=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;ca993f.0=1;d30d05.0=1;105ac6.0=1;bed1df.0=1"}}'), | |
| ('yu', '1656658121627748886'), | |
| ('p', iter), | |
| ('from', 'tabbar'), | |
| ('text', query_mn), | |
| ('rpt', 'image'), | |
| ('serpid', 'a7tbQ4lJOYyrChOZD000iQ'), | |
| ('serpListType', 'horizontal'), | |
| ('thumbSnippet', '0'), | |
| ] | |
| if type_img: | |
| params.append(("itype",type_img_d[type_img])) | |
| if recent: | |
| params.append(("recent","7D")) | |
| if img_size: | |
| try: | |
| params.append(("isize",type_img_size[img_size])) | |
| except: | |
| size_offset = [("isize","eq"),("iw",img_size[0]),("ih",img_size[1])] | |
| for i in size_offset: | |
| params.append(i) | |
| response = requests.get('https://yandex.ru/images/search', headers=headers, params=params) | |
| json_data = json.dumps(response.text) | |
| json_without_slash = json.loads(json_data) | |
| try: | |
| data_json = json.loads(json_without_slash)["blocks"][2]['html'] | |
| except json.decoder.JSONDecodeError: | |
| break | |
| soup = bs(data_json, 'html.parser') | |
| list_json = soup.find_all("div", class_=re.compile("serp-item serp-item_type_search serp-item_group_search serp-item_pos_.* serp-item_scale_yes justifier__item i-bem")) | |
| list_links_t = [] | |
| for i in list_json: | |
| items = i.get("data-bem") | |
| item = json.loads(items) | |
| serp_item = item["serp-item"] | |
| list_links.append(serp_item["preview"][0]["url"]) | |
| list_links_t.append(serp_item["preview"][0]["url"]) | |
| list_dict.append(serp_item) | |
| if limit>1: | |
| iter+=1 | |
| print(iter) | |
| pager.append({f"{iter}":list_links_t}) | |
| if iter==limit+start_: | |
| print(iter) | |
| end = time.monotonic() | |
| break | |
| print(len(list_links)) | |
| print(len(list_links),end-start) | |
| if type_=="all": | |
| return list_links | |
| if type_=="choice": | |
| return choice(list_links),len(list_links) | |
| if type_=="dic_ch": | |
| return choice(list_dict) | |
| if type_=="p_dict": | |
| return pager |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment