Skip to content

Instantly share code, notes, and snippets.

@dEN5-tech
Created January 7, 2022 16:28
Show Gist options
  • Select an option

  • Save dEN5-tech/792da7d6f05232fcb7fd4f52c4064e33 to your computer and use it in GitHub Desktop.

Select an option

Save dEN5-tech/792da7d6f05232fcb7fd4f52c4064e33 to your computer and use it in GitHub Desktop.
python | parse yandex images | requests | json | params | search by image | search by word
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import json
import sys
from pprint import pprint
from html_to_json import convert as cnv
from random import randint
from lxml.html.clean import Cleaner
import lxml
import re
from bs4 import BeautifulSoup as bs
from timeit import timeit
import os
from glob import glob
# Browser-impersonating HTTP headers shared by the module's GET requests
# (Chrome 96 on Windows).  'accept: application/json' asks Yandex's
# apphost endpoints for a JSON response rather than an HTML page.
headers = {
'authority': 'yandex.ru',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
'device-memory': '8',
'rtt': '150',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
'viewport-width': '575',
'dpr': '1',
'downlink': '4.15',
'ect': '4g',
'sec-ch-ua-platform': '"Windows"',
'accept': 'application/json',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5'
}
def info(id):
    """Fetch related-image metadata ("rim") for one Yandex image document.

    Parameters:
        id: document id of an image in Yandex's index (stringified into the
            'docid' query parameter).  The name shadows the builtin ``id``
            but is kept for backward compatibility with existing callers.

    Returns:
        dict: the decoded JSON body from the images-apphost/rim endpoint.

    Raises:
        requests.RequestException: on network failure / HTTP error status.
        ValueError: if the response body is not valid JSON.
    """
    params = (
        ('docid', str(id)),
        ('lang', 'ru'),
        ('mt', '1'),
        ('family', '0'),          # family (safe-search) filter disabled
        ('pornowhitelist', '1'),
        ('ipnd', '1'),
    )
    # Explicit timeout so a stalled connection cannot hang the caller forever
    # (the original request had none and could block indefinitely).
    response = requests.get(
        'https://yandex.ru/images-apphost/rim',
        headers=headers,
        params=params,
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
    response.raise_for_status()
    return response.json()
def load_image(byte):
    """Upload raw image bytes to Yandex's CBIR image-download endpoint.

    Parameters:
        byte: raw image file contents (bytes); sent as the POST body with
            content-type image/jpeg.

    Returns:
        dict: the decoded JSON body (the original code passed this dict to
        ``getInfoImage``, which reads its 'url' key).

    Raises:
        requests.RequestException: on network failure / HTTP error status.
        ValueError: if the response body is not valid JSON.
    """
    # Upload-specific headers: this request posts a JPEG body, so it carries
    # its own content-type/origin instead of the module-level header set.
    headers = {
        'authority': 'yandex.ru',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'device-memory': '8',
        'rtt': '200',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'viewport-width': '794',
        'content-type': 'image/jpeg',
        'dpr': '1',
        'downlink': '2.65',
        'ect': '4g',
        'sec-ch-ua-platform': '"Windows"',
        'accept': '*/*',
        'origin': 'https://yandex.ru',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5',
    }
    params = (
        ('cbird', '37'),
        ('images_avatars_size', 'preview'),
        ('images_avatars_namespace', 'images-cbir'),
    )
    # Timeout added; the leftover debug print(response) was removed.
    response = requests.post(
        'https://yandex.ru/images-apphost/image-download',
        headers=headers,
        params=params,
        data=byte,
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
def getInfoImage(url):
    """Run a Yandex image search and yield each result's rim id ("rimId").

    Parameters:
        url: one of
            - dict: the JSON returned by ``load_image`` (its 'url' key holds
              the uploaded image's CBIR preview URL) -> search by cbir id;
            - str not starting with "http": a search phrase -> search by text;
            - str URL: search by image URL.

    Yields:
        str: the "rimId" of every serp-item found in the result page.

    Bug fixed: the original tested ``isinstance(dict, type(url))`` — the
    arguments were reversed, so the check was always False and dict input
    crashed on ``url.startswith`` with AttributeError.
    """
    headers = {
        'authority': 'yandex.ru',
        'cache-control': 'max-age=0',
        'device-memory': '8',
        'dpr': '1',
        'viewport-width': '1280',
        'rtt': '200',
        'downlink': '2.2',
        'ect': '4g',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5',
    }
    if isinstance(url, dict):
        # NOTE(review): "".join drops the "/" separators from the rebuilt
        # URL — presumably "/".join was intended to point at the full-size
        # ("orig") image; kept as-is pending confirmation against the API.
        params = (
            ('url', "".join(url['url'].split("/")[:-2]) + "orig"),
            ('cbir_id', url['url'].split("get-images-cbir/")[-1].split("/preview")[0]),
            ('cbir_page', 'similar'),
            ('rpt', 'imageview'),
            ('family', '0'),
            ('pornowhitelist', '1'),
            ('ipnd', '1'),
        )
    elif not url.startswith("http"):
        # Plain text query.
        params = (
            ('text', url),
            ('from', 'tabbar'),
            ('family', '0'),
            ('pornowhitelist', '1'),
            ('ipnd', '1'),
        )
    else:
        # Search by image URL.
        params = (
            ('url', url),
            ('cbir_page', 'similar'),
            ('rpt', 'imageview'),
            ('family', '0'),
            ('pornowhitelist', '1'),
            ('ipnd', '1'),
        )
    # Timeout added; leftover debug print(response.url) removed.
    response = requests.get(
        'https://yandex.ru/images/search',
        headers=headers,
        params=params,
        timeout=30,
    )
    # Each serp item stores its metadata as JSON inside a data-bem attribute.
    root = lxml.html.fromstring(response.content)
    for raw in root.xpath('//*[@id]/@data-bem'):
        item = json.loads(raw)
        if "serp-item" in item and "rimId" in item["serp-item"]:
            yield item["serp-item"]["rimId"]
def sJson(response, name):
    """Strip scripts/styles/links from an HTML string and save it to ``<name>.html``."""
    with open(f"{name}.html", "w", encoding="utf-8") as out:
        sanitizer = Cleaner(
            style=True,
            scripts=True,
            javascript=True,
            inline_style=True,
            links=True,
            add_nofollow=False,
            page_structure=True,
            safe_attrs_only=False,
        )
        out.write(sanitizer.clean_html(response))
all_links = []  # module-global accumulator of collected image URLs
def vldc(elem):
    """Check whether the URL *elem* is reachable via HTTP GET.

    Returns:
        bool: True when the request completes, False on any request error.

    Bug fixed: the original had a bare ``except:`` and fell off the end on
    success (implicitly returning None), so it could never report a valid
    link; it also had no timeout and could hang forever.
    """
    try:
        requests.get(elem, timeout=10)
    except requests.RequestException:
        return False
    return True
def map_append(elem):
    """Append *elem*'s image URL (its "iu" field) to the module-global all_links list."""
    link = elem["iu"]
    all_links.append(link)
def get_from_dict(all_links, response):
    """Collect every image URL from *response*["rld"] into *all_links*.

    Parameters:
        all_links: list to append the URLs to (mutated in place).
        response: rim/info JSON dict; each entry of its "rld" list carries
            an "s" list of items with an "iu" (image URL) field.

    Bug fixed: the original called ``map(map_append, infos)`` without ever
    consuming the iterator — ``map`` is lazy, so nothing was appended.
    It also appended to the module global rather than the parameter; the
    fix honors the parameter (callers pass the global anyway).
    """
    for group in response["rld"]:
        for item in group["s"]:
            all_links.append(item["iu"])
def Glob_matching(src):
    """Return True when *src* names a file (with an extension) in the CWD.

    Bug fixed: the original returned True or implicitly None; this version
    returns an explicit bool, which is backward compatible in boolean
    context (callers only use it in ``or`` / ``if`` tests).
    """
    return src in glob("*.*")
def _collect_rim_links(doc, links):
    """Append every image URL ("iu") in *doc*["rld"] to *links*.

    Entries that also carry an "id" are expanded one level further through
    the rim endpoint via ``get_from_dict``.
    """
    for group in doc["rld"]:
        for item in group["s"]:
            links.append(item["iu"])
            if "id" in item:
                get_from_dict(links, info(item["id"]))


def links_yd(uri):
    """Collect image links from Yandex image search for *uri*.

    Parameters:
        uri: one of
            - a local image path ("C:..."-style absolute path, or a file
              present in the current directory),
            - raw image bytes,
            - a search phrase or image URL (str).

    Returns:
        list: the module-global ``all_links`` list, extended with the
        collected URLs; None when a local-file lookup fails.

    Bugs fixed vs. the original:
      - ``isinstance(str, type(uri))`` / ``isinstance(bytes, type(uri))``
        had reversed arguments (always False), so the file and bytes
        branches never executed;
      - those branches passed the ``getInfoImage`` generator object
        directly to ``info`` instead of mapping ``info`` over the yielded
        rim ids, as the (working) text/URL branch does;
      - the triplicated collection loop is factored into one helper.
    """
    if (isinstance(uri, str) and uri.startswith("C:")) or Glob_matching(uri):
        with open(uri, "rb") as image:
            payload = image.read()
        try:
            for doc in map(info, getInfoImage(load_image(payload))):
                _collect_rim_links(doc, all_links)
            return all_links
        except Exception:
            # Best-effort: preserve the original "return None on any
            # failure" contract for local-file lookups.
            return None
    elif isinstance(uri, bytes):
        for doc in map(info, getInfoImage(load_image(uri))):
            _collect_rim_links(doc, all_links)
        return all_links
    else:
        for doc in map(info, getInfoImage(uri)):
            _collect_rim_links(doc, all_links)
        return all_links
if __name__ == "__main__":
    # Demo entry point: text search for "cats".  Guarded so that importing
    # this module no longer fires a network request as a side effect.
    print(links_yd("cats"))
@RomanAfn
Copy link

спасибо, очень помогло разобраться с пост запросом яндекса

@dEN5-tech
Copy link
Author

dEN5-tech commented Jul 14, 2023

спасибо, очень помогло разобраться с пост запросом яндекса

Пожалуйста

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment