2019-12-04

Post date: Dec 05, 2019 12:34:23 AM

print column number

awk '{print NF}' file | sort -nu | tail -n 1

New code to filter the common variants in the filtered vcf files, in Python:

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

"""

This script reads in the vcf file and returns a matrix of genotypes.

Revert the genotype likelihoods from Phred scale to probabilities.

Proba are between 0 and 1.

Sum all three probabilities to get the sum (which will be more than one) and normalize the obtained proba by this number.

Then apply the filters:

If the proba of heteroZ is >0.99 then call it an heterozygote and code it with a 1.

If the proba of heteroZ is <0.01 then call it an homozygote and assign a 0.

If the proba of heteroZ is between 0.01 and 0.99, assign "NA".

Then calculate proportion of heteroZ for each allele and make the graph of number of number of heteroZ kept as a function of the cut.

"""

# Part 1 - transform Phred scores into probabilities

# Modules needed

import re as regex

import argparse

import os

# import math # to calculate log functions

# Arguments to be read

parser=argparse.ArgumentParser()

parser.add_argument("-i","--inputfile", required=True, help="Input vcf file.")

#parser.add_argument("-o","--outputfile", required=True, help="Output file name, probas.")

#parser.add_argument("-o2","--outputfile2", required=True, help="Output file name 2, heteroZ labels.")

# Read arguments

args = parser.parse_args() # Store arguments in parse

# Assign argument to variable name

input_file = args.inputfile

#output_file = args.outputfile

#output_file2 = args.outputfile2

# Loop over different thresholds for heteroZ

thresh = [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]

for t in thresh:

# Open the file, read line by line

output_file_name_1 = "proba_{}.txt".format(t)

final = open(os.path.join('./pando_only_variants/',output_file_name_1),"w+")

output_file_name_2 = "heteroZ_label_{}.txt".format(t)

heteroZ = open(os.path.join('./pando_only_variants/',output_file_name_2),"w+")

with open(input_file) as file:

# Go through the file line by line

for line in file:

# if the line start with a scaffolfd name, in this case "Potrs" for Pop. tremuloides

if not line.startswith('Potrs'):

pass

else:

# Match the genotype likelihood (gl) in the vcf file

gl = regex.findall(r':\d+,\d+,\d+:', line)

# Loop over the gl, three items each time for homoZ reference, hetroZ, homoZ alternative

pre_final = []

for ind in gl:

ind = ind.split(',')

temp = []

for i in ind:

i = regex.sub(':','',i)

i = int(i)

# Revert the Phred score to proba

proba = 10**(i/(-10))

temp.append(proba)

sum_proba = sum(temp)

for j in temp:

pre_final.append(j/sum_proba)

# This loops keeps all genotype likelihhod Phred scores converted back to probabilities

for item in pre_final:

final.write(f'{item}\t')

final.write('\n')

proba_het = pre_final[1::3]

# This second loop only looks at the second element every three elements and decide according to t if the ind is heteR

for proba in proba_het:

proba = float(proba) # from string to float

if proba >= (1-t):

# The individual is heterozygote -- coded with 1

heteroZ.write(f'1\t')

elif proba <= t:

# The individual is homozygote -- coded with 0

heteroZ.write(f'0\t')

# We don't know

elif t < proba < (1-t):

heteroZ.write(f'2\t')# -- coded with 2

heteroZ.write('\n')