Skip to content

Instantly share code, notes, and snippets.

@travc
Last active March 19, 2020 06:49
Show Gist options
  • Select an option

  • Save travc/cf415816d144f04b87d68b655bf5bbad to your computer and use it in GitHub Desktop.

Select an option

Save travc/cf415816d144f04b87d68b655bf5bbad to your computer and use it in GitHub Desktop.
count doubletons
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-19T06:47:10.727829Z",
"start_time": "2020-03-19T06:47:09.615054Z"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import subprocess"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-19T06:47:10.735881Z",
"start_time": "2020-03-19T06:47:10.731669Z"
}
},
"outputs": [],
"source": [
"FN = '/share/lanzarolab/users/mdelima/f2variants/chro3L_doubletons-flt-01'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-19T06:47:55.693502Z",
"start_time": "2020-03-19T06:47:10.739440Z"
}
},
"outputs": [],
"source": [
"# use numpy's function to read the file\n",
"# very long rows are evil... pandas will be very very slow, and numpy will just be slow (~50 sec)\n",
"# downside of numpy is that it doesn't read strings (the first row of sample names)\n",
"dat = np.genfromtxt(FN)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-19T06:47:55.709243Z",
"start_time": "2020-03-19T06:47:55.698390Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(77, 374536)\n",
"[[nan 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
" [nan 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
" [nan 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
" [nan 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
" [nan 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n"
]
}
],
"source": [
"# just taking a look at the data\n",
"print(dat.shape)\n",
"print(dat[0:5,0:10])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-19T06:47:56.119257Z",
"start_time": "2020-03-19T06:47:55.712303Z"
}
},
"outputs": [],
"source": [
"# read just the sample names now\n",
"# we are actually using the bash command 'cut' to efficiently grab the first column\n",
"samples = subprocess.check_output(['cut','-f','1',FN], universal_newlines=True).split()\n",
"assert len(samples) == dat.shape[0] # makes sure the number of rows are the same"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-19T06:48:01.467123Z",
"start_time": "2020-03-19T06:47:56.124399Z"
}
},
"outputs": [],
"source": [
"out = np.full((len(samples),len(samples)), np.nan) # empty array (of nan values) to store results\n",
"# for each pair...\n",
"for i in range(len(samples)):\n",
" for j in range(i+1):\n",
" if i == j: # the diagonal... lets output the number of heterozygous sites\n",
" out[i,i] = dat[i,1:].sum()\n",
" else: # a pairwise comparison\n",
" # we are using the trick that 0=False and anything else is True\n",
" # and then the trick that True=1 (so sum of boolean array is # of True)\n",
" # BTW: The `1:` range is to skip the first column (was sample names)\n",
" # BTW2: `nan` values evaluate to True, which might be a problem if you have missing data\n",
" out[i,j] = np.logical_and(dat[i,1:], dat[j,1:]).sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-03-19T06:48:01.570448Z",
"start_time": "2020-03-19T06:48:01.472166Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>98SAOT041</th>\n",
" <th>98SAOT027</th>\n",
" <th>98SAOT018</th>\n",
" <th>98SAOT016</th>\n",
" <th>98SAOT002</th>\n",
" <th>98PRIN006</th>\n",
" <th>8017_D</th>\n",
" <th>98PRIN026</th>\n",
" <th>6617_P</th>\n",
" <th>6617_B</th>\n",
" <th>...</th>\n",
" <th>11TIKO391</th>\n",
" <th>98PRIN046</th>\n",
" <th>12SELI0021</th>\n",
" <th>0117_J</th>\n",
" <th>11TIKO403</th>\n",
" <th>12SELI0072</th>\n",
" <th>2014ABO003</th>\n",
" <th>2014COV001</th>\n",
" <th>2014COV002</th>\n",
" <th>2014COV006</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>98SAOT041</th>\n",
" <td>2104.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98SAOT027</th>\n",
" <td>60.0</td>\n",
" <td>2005.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98SAOT018</th>\n",
" <td>282.0</td>\n",
" <td>87.0</td>\n",
" <td>2173.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98SAOT016</th>\n",
" <td>111.0</td>\n",
" <td>92.0</td>\n",
" <td>145.0</td>\n",
" <td>1842.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98SAOT002</th>\n",
" <td>185.0</td>\n",
" <td>142.0</td>\n",
" <td>99.0</td>\n",
" <td>123.0</td>\n",
" <td>2031.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12SELI0072</th>\n",
" <td>27.0</td>\n",
" <td>25.0</td>\n",
" <td>25.0</td>\n",
" <td>9.0</td>\n",
" <td>36.0</td>\n",
" <td>20.0</td>\n",
" <td>24.0</td>\n",
" <td>43.0</td>\n",
" <td>16.0</td>\n",
" <td>32.0</td>\n",
" <td>...</td>\n",
" <td>295.0</td>\n",
" <td>36.0</td>\n",
" <td>874.0</td>\n",
" <td>30.0</td>\n",
" <td>144.0</td>\n",
" <td>19488.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2014ABO003</th>\n",
" <td>13.0</td>\n",
" <td>15.0</td>\n",
" <td>15.0</td>\n",
" <td>5.0</td>\n",
" <td>17.0</td>\n",
" <td>17.0</td>\n",
" <td>12.0</td>\n",
" <td>12.0</td>\n",
" <td>11.0</td>\n",
" <td>4.0</td>\n",
" <td>...</td>\n",
" <td>255.0</td>\n",
" <td>19.0</td>\n",
" <td>704.0</td>\n",
" <td>14.0</td>\n",
" <td>103.0</td>\n",
" <td>689.0</td>\n",
" <td>17344.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2014COV001</th>\n",
" <td>15.0</td>\n",
" <td>30.0</td>\n",
" <td>23.0</td>\n",
" <td>19.0</td>\n",
" <td>18.0</td>\n",
" <td>21.0</td>\n",
" <td>11.0</td>\n",
" <td>20.0</td>\n",
" <td>9.0</td>\n",
" <td>23.0</td>\n",
" <td>...</td>\n",
" <td>223.0</td>\n",
" <td>29.0</td>\n",
" <td>678.0</td>\n",
" <td>18.0</td>\n",
" <td>98.0</td>\n",
" <td>640.0</td>\n",
" <td>1113.0</td>\n",
" <td>17974.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2014COV002</th>\n",
" <td>19.0</td>\n",
" <td>35.0</td>\n",
" <td>20.0</td>\n",
" <td>6.0</td>\n",
" <td>21.0</td>\n",
" <td>23.0</td>\n",
" <td>20.0</td>\n",
" <td>28.0</td>\n",
" <td>20.0</td>\n",
" <td>21.0</td>\n",
" <td>...</td>\n",
" <td>270.0</td>\n",
" <td>24.0</td>\n",
" <td>605.0</td>\n",
" <td>19.0</td>\n",
" <td>111.0</td>\n",
" <td>647.0</td>\n",
" <td>1147.0</td>\n",
" <td>776.0</td>\n",
" <td>18369.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2014COV006</th>\n",
" <td>15.0</td>\n",
" <td>15.0</td>\n",
" <td>15.0</td>\n",
" <td>10.0</td>\n",
" <td>22.0</td>\n",
" <td>24.0</td>\n",
" <td>16.0</td>\n",
" <td>12.0</td>\n",
" <td>11.0</td>\n",
" <td>17.0</td>\n",
" <td>...</td>\n",
" <td>236.0</td>\n",
" <td>26.0</td>\n",
" <td>720.0</td>\n",
" <td>24.0</td>\n",
" <td>109.0</td>\n",
" <td>710.0</td>\n",
" <td>897.0</td>\n",
" <td>997.0</td>\n",
" <td>1086.0</td>\n",
" <td>18524.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>77 rows × 77 columns</p>\n",
"</div>"
],
"text/plain": [
" 98SAOT041 98SAOT027 98SAOT018 98SAOT016 98SAOT002 98PRIN006 \\\n",
"98SAOT041 2104.0 NaN NaN NaN NaN NaN \n",
"98SAOT027 60.0 2005.0 NaN NaN NaN NaN \n",
"98SAOT018 282.0 87.0 2173.0 NaN NaN NaN \n",
"98SAOT016 111.0 92.0 145.0 1842.0 NaN NaN \n",
"98SAOT002 185.0 142.0 99.0 123.0 2031.0 NaN \n",
"... ... ... ... ... ... ... \n",
"12SELI0072 27.0 25.0 25.0 9.0 36.0 20.0 \n",
"2014ABO003 13.0 15.0 15.0 5.0 17.0 17.0 \n",
"2014COV001 15.0 30.0 23.0 19.0 18.0 21.0 \n",
"2014COV002 19.0 35.0 20.0 6.0 21.0 23.0 \n",
"2014COV006 15.0 15.0 15.0 10.0 22.0 24.0 \n",
"\n",
" 8017_D 98PRIN026 6617_P 6617_B ... 11TIKO391 98PRIN046 \\\n",
"98SAOT041 NaN NaN NaN NaN ... NaN NaN \n",
"98SAOT027 NaN NaN NaN NaN ... NaN NaN \n",
"98SAOT018 NaN NaN NaN NaN ... NaN NaN \n",
"98SAOT016 NaN NaN NaN NaN ... NaN NaN \n",
"98SAOT002 NaN NaN NaN NaN ... NaN NaN \n",
"... ... ... ... ... ... ... ... \n",
"12SELI0072 24.0 43.0 16.0 32.0 ... 295.0 36.0 \n",
"2014ABO003 12.0 12.0 11.0 4.0 ... 255.0 19.0 \n",
"2014COV001 11.0 20.0 9.0 23.0 ... 223.0 29.0 \n",
"2014COV002 20.0 28.0 20.0 21.0 ... 270.0 24.0 \n",
"2014COV006 16.0 12.0 11.0 17.0 ... 236.0 26.0 \n",
"\n",
" 12SELI0021 0117_J 11TIKO403 12SELI0072 2014ABO003 2014COV001 \\\n",
"98SAOT041 NaN NaN NaN NaN NaN NaN \n",
"98SAOT027 NaN NaN NaN NaN NaN NaN \n",
"98SAOT018 NaN NaN NaN NaN NaN NaN \n",
"98SAOT016 NaN NaN NaN NaN NaN NaN \n",
"98SAOT002 NaN NaN NaN NaN NaN NaN \n",
"... ... ... ... ... ... ... \n",
"12SELI0072 874.0 30.0 144.0 19488.0 NaN NaN \n",
"2014ABO003 704.0 14.0 103.0 689.0 17344.0 NaN \n",
"2014COV001 678.0 18.0 98.0 640.0 1113.0 17974.0 \n",
"2014COV002 605.0 19.0 111.0 647.0 1147.0 776.0 \n",
"2014COV006 720.0 24.0 109.0 710.0 897.0 997.0 \n",
"\n",
" 2014COV002 2014COV006 \n",
"98SAOT041 NaN NaN \n",
"98SAOT027 NaN NaN \n",
"98SAOT018 NaN NaN \n",
"98SAOT016 NaN NaN \n",
"98SAOT002 NaN NaN \n",
"... ... ... \n",
"12SELI0072 NaN NaN \n",
"2014ABO003 NaN NaN \n",
"2014COV001 NaN NaN \n",
"2014COV002 18369.0 NaN \n",
"2014COV006 1086.0 18524.0 \n",
"\n",
"[77 rows x 77 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# make a nice dataframe with the sample names as index and column headers\n",
"df = pd.DataFrame(out, index=samples, columns=samples)\n",
"df.to_csv('foo.csv', sep='\\t') # could do to_excel or many other options\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment