Created
January 18, 2025 03:33
-
-
Save markizano/7f567da1d5facaec68e58996664bcfcc to your computer and use it in GitHub Desktop.
Finds who you follow on TikTok based on your data export and outputs a JSON file with a list of links from their profiles.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| ''' | |
| I decided to go with a Python script instead of an Angular app since there are a few challenges I cannot overcome with an Angular app: | |
| - I don't control the CORS policy of the website I'm scraping | |
| - I am not guaranteed to be authenticated to the target subscription site I want to pick out. | |
| - I am not guaranteed to pick the correct data if the CORS policy denies me the ability to scrape the data. | |
| Sooooo... in light of the recent challenges, this Python script should bypass a lot of that. | |
| What this will do: | |
| Usage: | |
| follow-my-tiktok.py user_data_tiktok.json | |
| Goto TikTok: Navigate to Settings -> Export data. | |
| Request a JSON export of the data. | |
| This script will then parse the JSON to find a list of the users you follow. | |
| Out of the users, I will fetch the webpage from the respective user. | |
| From that, I will scrape their BIO. | |
| From their BIO, I will scrape out any links. | |
| From the links, I will follow any 301 or 302 redirects and any links in that page. | |
| I will produce to you a list of links from what I scrapped from the BIO and ONLY 1 layer deep into the links provided from the BIO. | |
| It will be your responsibility to click on the subscribe link to their respective media pages in order to follow them where you want. | |
| In summary: Generate a list of links from a given TikTok data export in tree format. So { $username: [ $link1, $link2, ... ] } | |
| Output to `tiktok-links.json`. | |
| ''' | |
| import json | |
| import requests | |
| import io, sys, os | |
| import re | |
| import logging | |
# Verbose (DEBUG) logging by default; format carries logger name, function and PID.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(name)s.%(funcName)s(PID=%(process)d %(levelname)-8s) %(message)s')
log = logging.getLogger('FollowMyTikTok')
# Domains that appear in scraped pages but are never subscription targets; skipped in follow_redirects().
IGNORE_DOMAINS = ('gstatic.com', 'www.w3.org', 'assets.production.linktr.ee')
# User-Agent sent with every HTTP request; override via $UA_BROWSER to mimic a real browser.
USER_AGENT = os.environ.get('UA_BROWSER', 'python-TikTok Helper by @markizano https://gist.github.com/markizano/')
# Path of the JSON report written by main(); override via $TIKTOK_OUTFILE.
TIKTOK_OUTFILE = os.environ.get('TIKTOK_OUTFILE', 'tiktok-links.json')
def load_tiktok_export() -> dict:
    '''
    Load the TikTok export into memory. Return as the data structure.

    The export path comes from argv[1] when given, otherwise from the
    $TIKTOK_EXPORT environment variable.

    :returns: the parsed JSON export as a dict.
    :raises SystemExit: when neither argv[1] nor $TIKTOK_EXPORT names a file.
    '''
    # The original `sys.argv[1] or os.environ.get(...)` raised IndexError before
    # the env-var fallback could ever apply; check argv length explicitly.
    path = sys.argv[1] if len(sys.argv) > 1 else os.environ.get('TIKTOK_EXPORT')
    if not path:
        log.error('No export file given: pass it as argv[1] or set $TIKTOK_EXPORT.')
        raise SystemExit(1)
    log.info('Loading tiktok user data...')
    # Context manager closes the handle even if json.load() raises (original leaked it).
    with io.open(path, 'r') as fd:
        return json.load(fd)
def fetch_bio(username: str) -> tuple[str, str]:
    '''
    Fetch the BIO of the user:
    - What will come back is HTML.
    - From the HTML, there is a <script id=__UNIVERSAL_DATA_FOR_REHYDRATION__> tag containing JSON.
    - From that JSON, read .__DEFAULT_SCOPE__["webapp.user-detail"].userInfo.user.signature (the BIO)
      and, when present, the bioLink.link external profile link.

    :param username: TikTok handle without the leading '@'.
    :returns: (signature, bioLink) — either may be '' when not found.
    '''
    log.info(f'Fetching BIO for {username}...')
    # User agent comes from $UA_BROWSER when set; TikTok may block unknown clients.
    response = requests.get(f"https://www.tiktok.com/@{username}", headers={'User-Agent': USER_AGENT})
    search = re.search(r'<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" type="application/json">(.*?)</script>', response.text)
    signature = ''
    bioLink = ''
    if search:
        data = json.loads(search.group(1))
        # Walk with .get() so a missing key (page-layout change, private user) yields ''
        # instead of the KeyError the original direct indexing would raise.
        user = data.get("__DEFAULT_SCOPE__", {}).get("webapp.user-detail", {}).get("userInfo", {}).get("user", {})
        signature = user.get("signature", '')
        if 'bioLink' in user:
            # Normalize the scheme to https. The original used str.lstrip('https://'),
            # which strips a *character set* (h,t,p,s,:,/) and mangles domains such as
            # 'spotify.com' -> 'potify.com'; removeprefix() removes the literal scheme only.
            raw = user["bioLink"]["link"].lower()
            bioLink = 'https://' + raw.removeprefix('https://').removeprefix('http://')
    return signature, bioLink
def scrape_links(bio: str) -> list[str]:
    '''
    Extract every http(s) URL found in a user's BIO text.

    :param bio: free-form BIO string scraped from the profile page.
    :returns: all matched links, in order of appearance (possibly empty).
    '''
    url_pattern = re.compile(r'(https?://[^\s]+)')
    return url_pattern.findall(bio)
def follow_redirects(links: list[str]) -> list[str]:
    '''
    Given a list of links, follow any redirects or any links they have.
    Return any redirects or embedded links in the HTML response.

    :param links: absolute http(s) URLs, typically scraped from a BIO.
    :returns: redirect targets plus links embedded in each fetched page (one layer deep),
              with IGNORE_DOMAINS filtered out.
    '''
    result = []
    for link in links:
        log.info(f'Following link {link} ...')
        try:
            # allow_redirects=False so 301/302 responses are observed here; with the
            # requests default (True) they are followed transparently and the
            # status-code branch below was dead code.
            response = requests.get(link, headers={'User-Agent': USER_AGENT}, timeout=10, allow_redirects=False)
            if response.status_code in [301, 302]:
                result.append(response.headers['Location'])
            else:
                # Distinct name: the original shadowed the outer `link` loop variable.
                for embedded in re.findall(r'(https?://[^\s"\x27]+)', response.text):
                    if any(ignore in embedded for ignore in IGNORE_DOMAINS):
                        continue
                    result.append(embedded)
        except requests.exceptions.RequestException as e:
            # Best-effort scrape: the original only caught ConnectTimeout, so DNS
            # failures, read timeouts or SSL errors crashed the whole run.
            log.error(f'Error fetching {link}: {e}')
            continue
    return result
def main() -> int:
    '''
    Main application entrypoint.
    Ties all the functions together to produce a list of links to output.

    For each followed account: fetch the BIO, scrape its links (plus the profile's
    bioLink when present), follow them one layer deep, and record everything under
    the username in the output JSON.

    :returns: 0 on success (used as the process exit code).
    '''
    export = load_tiktok_export()
    output = {}
    for account in export["Activity"]["Following List"]["Following"]:
        bio, bioLink = fetch_bio(account["UserName"])
        links = scrape_links(bio)
        if bioLink:
            links.append(bioLink)
        redirects = follow_redirects(links)
        output[account["UserName"]] = {
            'bio': bio,
            'links': links,
            'deeplinks': redirects
        }
    log.info(f'Writing output to {TIKTOK_OUTFILE}...')
    # Context manager guarantees the report is flushed and closed; the original
    # left the io.open() handle dangling, relying on interpreter shutdown.
    with io.open(TIKTOK_OUTFILE, 'w') as fd:
        json.dump(output, fd)
    log.info('Complete!')
    return 0
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment