Post date: Dec 05, 2019 12:34:23 AM
print column number
awk '{print NF}' file | sort -nu | tail -n 1
New code to filter the common variants in the filtered vcf files, in Python:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script reads in the vcf file and returns a matrix of genotypes.
Revert the genotype likelihoods from Phred scale to probabilities.
Proba are between 0 and 1.
Sum all three probabilities to get the sum (which will be more than one) and normalize the obtained proba by this number.
Then apply the filters:
If the proba of heteroZ is >0.99 then call it an heterozygote and code it with a 1.
If the proba of heteroZ is <0.01 then call it an homozygote and assign a 0.
If the proba of heteroZ is between 0.01 and 0.99, assign "NA".
Then calculate proportion of heteroZ for each allele and make the graph of number of number of heteroZ kept as a function of the cut.
"""
# Part 1 - transform Phred scores into probabilities
# Modules needed
import re as regex
import argparse
import os
# import math # to calculate log functions
# Arguments to be read
parser=argparse.ArgumentParser()
parser.add_argument("-i","--inputfile", required=True, help="Input vcf file.")
#parser.add_argument("-o","--outputfile", required=True, help="Output file name, probas.")
#parser.add_argument("-o2","--outputfile2", required=True, help="Output file name 2, heteroZ labels.")
# Read arguments
args = parser.parse_args() # Store arguments in parse
# Assign argument to variable name
input_file = args.inputfile
#output_file = args.outputfile
#output_file2 = args.outputfile2
# Loop over different thresholds for heteroZ
thresh = [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]
for t in thresh:
# Open the file, read line by line
output_file_name_1 = "proba_{}.txt".format(t)
final = open(os.path.join('./pando_only_variants/',output_file_name_1),"w+")
output_file_name_2 = "heteroZ_label_{}.txt".format(t)
heteroZ = open(os.path.join('./pando_only_variants/',output_file_name_2),"w+")
with open(input_file) as file:
# Go through the file line by line
for line in file:
# if the line start with a scaffolfd name, in this case "Potrs" for Pop. tremuloides
if not line.startswith('Potrs'):
pass
else:
# Match the genotype likelihood (gl) in the vcf file
gl = regex.findall(r':\d+,\d+,\d+:', line)
# Loop over the gl, three items each time for homoZ reference, hetroZ, homoZ alternative
pre_final = []
for ind in gl:
ind = ind.split(',')
temp = []
for i in ind:
i = regex.sub(':','',i)
i = int(i)
# Revert the Phred score to proba
proba = 10**(i/(-10))
temp.append(proba)
sum_proba = sum(temp)
for j in temp:
pre_final.append(j/sum_proba)
# This loops keeps all genotype likelihhod Phred scores converted back to probabilities
for item in pre_final:
final.write(f'{item}\t')
final.write('\n')
proba_het = pre_final[1::3]
# This second loop only looks at the second element every three elements and decide according to t if the ind is heteR
for proba in proba_het:
proba = float(proba) # from string to float
if proba >= (1-t):
# The individual is heterozygote -- coded with 1
heteroZ.write(f'1\t')
elif proba <= t:
# The individual is homozygote -- coded with 0
heteroZ.write(f'0\t')
# We don't know
elif t < proba < (1-t):
heteroZ.write(f'2\t')# -- coded with 2
heteroZ.write('\n')