Last active
July 4, 2018 16:41
-
-
Save Dbyrum/09c395f9584ddc3a45957721f5a6915c to your computer and use it in GitHub Desktop.
first task
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Import libraries up front\n", | |
| "import json\n", | |
| "\n", | |
| "# From Table S13 in Plaisier et al., Cell Systems 2016\n", | |
| "# These are Entrez IDs (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3013746/)\n", | |
| "input = ['430', '1052', '1053', '1385', '84699', '9586', '1871', '1874', '144455', '79733', '1960', '1997', '2002', '2004', '80712', '2114', '2115', '2120', '51513', '2551', '2623', '2624', '2625', '9421', '3232', '10320', '3659', '3662', '3670', '91464', '3726', '10661', '11278', '128209', '10365', '9314', '1316', '51176', '9935', '23269', '4602', '4774', '4790', '7025', '9480', '5468', '5914', '5916', '3516', '5971', '864', '6257', '4093', '6659', '6660', '6662', '25803', '347853', '30009', '9496', '6929', '6925', '8463', '7022', '29842', '10155', '6935', '132625', '23051', '85416', '7707', '7764', '23528', '201516']\n", | |
| "\n", | |
| "# Loading JSON file\n", | |
| "# https://www.safaribooksonline.com/library/view/python-cookbook-3rd/9781449357337/ch06s02.html\n", | |
| "# Example:\n", | |
| "# import json\n", | |
| "#\n", | |
| "# # Reading data back\n", | |
| "# with open('data.json', 'r') as f:\n", | |
| "# data = json.load(f)\n", | |
| "\n", | |
| "# Reading TF regulator to TF target gene relationships into Python\n", | |
| "# The json library we import takes care of most of the work\n", | |
| "with open('tfbsDb_plus_and_minus_5000_entrez.json', 'r') as f:\n", | |
| " tfbsDb = json.load(f)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "['SOX10_HMG_full_dimeric_16_1', 'V_AP2ALPHA_01_M00469', 'V_SIX6_01_M01345', 'Pitx1.1', 'ELF1_ETS_full_monomeric_12_1']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Example set of keys in tfbsDb, they are Motif IDs (http://jaspar.genereg.net/search?q=Homo%20sapiens&collection=CORE&tax_group=vertebrates)\n", | |
| "print(list(tfbsDb.keys())[0:5])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "['10', '100131211', '100288797', '100302736', '10057']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Example set of values under a specific Motif ID, they are Entrez IDs\n", | |
| "print(tfbsDb[list(tfbsDb.keys())[1]][0:5])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "1185\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(len(tfbsDb[list(tfbsDb.keys())[0]]))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "['Motif Name', 'Gene Symbol', 'Entrez ID']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Read in humanTFs file\n", | |
| " \n", | |
| "id2motif = {}\n", | |
| "motif2id = {}\n", | |
| "with open('id_conversion/humanTFs_All.csv','r') as inFile:\n", | |
| " # Use the readline() function to read in a single line\n", | |
| " # strip() gets rid of the newline character at the end of the line\n", | |
| " # split(',') splits up the line into columns based on commas\n", | |
| " header = inFile.readline().strip().split(',')\n", | |
| " print (header)\n", | |
| " while 1:\n", | |
| " inLine = inFile.readline()\n", | |
| " if not inLine:\n", | |
| " break\n", | |
| " split = inLine.strip().split(',') \n", | |
| " \n", | |
| " # TODO Fill out the id2motif dictionary (key = Entrez ID, value = Motif Name)\n", | |
| " # if split[2]\n", | |
| " \n", | |
| " if not split[2] in id2motif:\n", | |
| " id2motif[split[2]] = []\n", | |
| " id2motif[split[2]].append(split[0]) \n", | |
| " # TODO Fill out the motif2id dictionary (key = Motif Name, value = Entrez ID)\n", | |
| " motif2id[split[0]]=split[2]\n", | |
| " \n", | |
| " \n", | |
| " \n", | |
| "\n", | |
| "#print(len(motif2id))\n", | |
| "#print(len(id2motif.keys()))\n", | |
| "#print(id2motif)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "\n", | |
| "## To build a TF regulator to TF target gene network (constrained to TFs within the input list).\n", | |
| "## This will require mapping from:\n", | |
| "## 1. Input list of potential TF regulator Entrez Gene IDs (input) \n", | |
| "## 2. List of Motif IDs for an Entrez Gene ID in the input list (either id2motif or motif2id)\n", | |
| "## 3. TF target genes that are Entrez Gene IDs that are the values under a specific Motif ID in tfbsDb\n", | |
| "## 4. Restrict TF target genes to only those in the input list\n", | |
| "## 5. Add new entry to tfNetwor dictionary that has as the key the TF regulator and the values all the TF target genes\n", | |
| "tfNetwork = {}\n", | |
| "\n", | |
| "for eachTfReg in input: # for loop that assigns each iteration to eachTfReg\n", | |
| " if eachTfReg in id2motif:\n", | |
| " for eachMotif in id2motif[eachTfReg]: # loop function that checks motif2id in id2motif[eachTfReg]\n", | |
| " if eachMotif in tfbsDb:\n", | |
| " targets = tfbsDb[eachMotif] # assign targets from id2motif[eachTfReg]/eachTfreg\n", | |
| " \n", | |
| " for eachTarget in targets:\n", | |
| " if not eachTfReg in tfNetwork:\n", | |
| " tfNetwork[eachTfReg] = []\n", | |
| " \n", | |
| " if eachTarget in input and not eachTarget in tfNetwork[eachTfReg]:\n", | |
| " tfNetwork[eachTfReg].append(eachTarget)\n", | |
| " \n", | |
| "#print (tfNetwork)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "12\n", | |
| "['84699', '9421', '3726', '128209', '10365', '1316', '5971', '4093', '347853', '8463', '23051', '85416']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "l1 = []\n", | |
| "for eachTfReg in input:\n", | |
| " if not eachTfReg in tfNetwork:\n", | |
| " #print (eachTfReg)\n", | |
| " l1.append(eachTfReg)\n", | |
| " \n", | |
| "print(len(l1))\n", | |
| "print(l1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "#sorted(id2motif.keys())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['Ascl2.1',\n", | |
| " 'Ascl2.2',\n", | |
| " 'Ascl2_bHLH_DBD_dimeric_10_1',\n", | |
| " 'V_ASCL2_03_M02737',\n", | |
| " 'V_ASCL2_04_M02841']" | |
| ] | |
| }, | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "#motif2id['430']\n", | |
| "id2motif['430']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'430'" | |
| ] | |
| }, | |
| "execution_count": 42, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "motif2id['Ascl2.1']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "74\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print (len(input))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "['142', '190', '196', '257', '326']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print (list(id2motif.keys())[0:5])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "False" | |
| ] | |
| }, | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'142' in tfbsDb['Ascl2.1']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "#print(tfbsDb.keys())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "#for each in input:\n", | |
| " # print (each)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "62" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(tfNetwork)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.4" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment