Created
January 18, 2025 03:33
-
-
Save markizano/7f567da1d5facaec68e58996664bcfcc to your computer and use it in GitHub Desktop.
Finds who you follow on TikTok based on your data export and outputs a JSON file with a list of links from their profiles.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| ''' | |
| I decided to go with a Python script instead of an Angular app since there are a few challenges I cannot overcome with an Angular app: | |
| - I don't control the CORS policy of the website I'm scraping | |
| - I am not guaranteed to be authenticated to the target subscription site I want to pick out. | |
| - I am not guaranteed to pick the correct data if the CORS policy denies me the ability to scrape the data. | |
| Sooooo... in light of the recent challenges, this Python script should bypass a lot of that. | |
| What this will do: | |
| Usage: | |
| follow-my-tiktok.py user_data_tiktok.json | |
| Goto TikTok: Navigate to Settings -> Export data. | |
| Request a JSON export of the data. | |
| This script will then parse the JSON to find a list of the users you follow. | |
| Out of the users, I will fetch the webpage from the respective user. | |
| From that, I will scrape their BIO. | |
| From their BIO, I will scrape out any links. | |
| From the links, I will follow any 301 or 302 redirects and any links in that page. | |
| I will produce to you a list of links from what I scrapped from the BIO and ONLY 1 layer deep into the links provided from the BIO. | |
| It will be your responsibility to click on the subscribe link to their respective media pages in order to follow them where you want. | |
| In summary: Generate a list of links from a given TikTok data export in tree format. So { $username: [ $link1, $link2, ... ] } | |
| Output to `tiktok-links.json`. | |
| ''' | |
| import json | |
| import requests | |
| import io, sys, os | |
| import re | |
| import logging | |
# Verbose (DEBUG) logging by default; format carries logger name, function and PID.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(name)s.%(funcName)s(PID=%(process)d %(levelname)-8s) %(message)s')
log = logging.getLogger('FollowMyTikTok')
# Domains that appear in scraped pages but are never subscription targets; skipped in follow_redirects().
IGNORE_DOMAINS = ('gstatic.com', 'www.w3.org', 'assets.production.linktr.ee')
# User-Agent sent with every HTTP request; override via $UA_BROWSER to mimic a real browser.
USER_AGENT = os.environ.get('UA_BROWSER', 'python-TikTok Helper by @markizano https://gist.github.com/markizano/')
# Path of the JSON report written by main(); override via $TIKTOK_OUTFILE.
TIKTOK_OUTFILE = os.environ.get('TIKTOK_OUTFILE', 'tiktok-links.json')
def load_tiktok_export() -> dict:
    '''
    Load the TikTok export into memory. Return as the data structure.

    The export path comes from argv[1] when given, otherwise from the
    $TIKTOK_EXPORT environment variable.

    :returns: the parsed JSON export as a dict.
    :raises SystemExit: when neither argv[1] nor $TIKTOK_EXPORT names a file.
    '''
    # The original `sys.argv[1] or os.environ.get(...)` raised IndexError before
    # the env-var fallback could ever apply; check argv length explicitly.
    path = sys.argv[1] if len(sys.argv) > 1 else os.environ.get('TIKTOK_EXPORT')
    if not path:
        log.error('No export file given: pass it as argv[1] or set $TIKTOK_EXPORT.')
        raise SystemExit(1)
    log.info('Loading tiktok user data...')
    # Context manager closes the handle even if json.load() raises (original leaked it).
    with io.open(path, 'r') as fd:
        return json.load(fd)
def fetch_bio(username: str) -> tuple[str, str]:
    '''
    Fetch the BIO of the user:
    - What will come back is HTML.
    - From the HTML, there is a <script id=__UNIVERSAL_DATA_FOR_REHYDRATION__> tag containing JSON.
    - From that JSON, read .__DEFAULT_SCOPE__["webapp.user-detail"].userInfo.user.signature (the BIO)
      and, when present, the bioLink.link external profile link.

    :param username: TikTok handle without the leading '@'.
    :returns: (signature, bioLink) — either may be '' when not found.
    '''
    log.info(f'Fetching BIO for {username}...')
    # User agent comes from $UA_BROWSER when set; TikTok may block unknown clients.
    response = requests.get(f"https://www.tiktok.com/@{username}", headers={'User-Agent': USER_AGENT})
    search = re.search(r'<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" type="application/json">(.*?)</script>', response.text)
    signature = ''
    bioLink = ''
    if search:
        data = json.loads(search.group(1))
        # Walk with .get() so a missing key (page-layout change, private user) yields ''
        # instead of the KeyError the original direct indexing would raise.
        user = data.get("__DEFAULT_SCOPE__", {}).get("webapp.user-detail", {}).get("userInfo", {}).get("user", {})
        signature = user.get("signature", '')
        if 'bioLink' in user:
            # Normalize the scheme to https. The original used str.lstrip('https://'),
            # which strips a *character set* (h,t,p,s,:,/) and mangles domains such as
            # 'spotify.com' -> 'potify.com'; removeprefix() removes the literal scheme only.
            raw = user["bioLink"]["link"].lower()
            bioLink = 'https://' + raw.removeprefix('https://').removeprefix('http://')
    return signature, bioLink
def scrape_links(bio: str) -> list[str]:
    '''
    Extract every http(s) URL found in a user's BIO text.

    :param bio: free-form BIO string scraped from the profile page.
    :returns: all matched links, in order of appearance (possibly empty).
    '''
    url_pattern = re.compile(r'(https?://[^\s]+)')
    return url_pattern.findall(bio)
def follow_redirects(links: list[str]) -> list[str]:
    '''
    Given a list of links, follow any redirects or any links they have.
    Return any redirects or embedded links in the HTML response.

    :param links: absolute http(s) URLs, typically scraped from a BIO.
    :returns: redirect targets plus links embedded in each fetched page (one layer deep),
              with IGNORE_DOMAINS filtered out.
    '''
    result = []
    for link in links:
        log.info(f'Following link {link} ...')
        try:
            # allow_redirects=False so 301/302 responses are observed here; with the
            # requests default (True) they are followed transparently and the
            # status-code branch below was dead code.
            response = requests.get(link, headers={'User-Agent': USER_AGENT}, timeout=10, allow_redirects=False)
            if response.status_code in [301, 302]:
                result.append(response.headers['Location'])
            else:
                # Distinct name: the original shadowed the outer `link` loop variable.
                for embedded in re.findall(r'(https?://[^\s"\x27]+)', response.text):
                    if any(ignore in embedded for ignore in IGNORE_DOMAINS):
                        continue
                    result.append(embedded)
        except requests.exceptions.RequestException as e:
            # Best-effort scrape: the original only caught ConnectTimeout, so DNS
            # failures, read timeouts or SSL errors crashed the whole run.
            log.error(f'Error fetching {link}: {e}')
            continue
    return result
def main() -> int:
    '''
    Main application entrypoint.
    Ties all the functions together to produce a list of links to output.

    For each followed account: fetch the BIO, scrape its links (plus the profile's
    bioLink when present), follow them one layer deep, and record everything under
    the username in the output JSON.

    :returns: 0 on success (used as the process exit code).
    '''
    export = load_tiktok_export()
    output = {}
    for account in export["Activity"]["Following List"]["Following"]:
        bio, bioLink = fetch_bio(account["UserName"])
        links = scrape_links(bio)
        if bioLink:
            links.append(bioLink)
        redirects = follow_redirects(links)
        output[account["UserName"]] = {
            'bio': bio,
            'links': links,
            'deeplinks': redirects
        }
    log.info(f'Writing output to {TIKTOK_OUTFILE}...')
    # Context manager guarantees the report is flushed and closed; the original
    # left the io.open() handle dangling, relying on interpreter shutdown.
    with io.open(TIKTOK_OUTFILE, 'w') as fd:
        json.dump(output, fd)
    log.info('Complete!')
    return 0
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment