Filter heterozygotes to keep SNPs with >80% hets

Post date: Jan 13, 2020 10:42:28 AM

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

"""

Created on Mon Dec 9 17:27:15 2019

@author: Papaya

"""

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

"""

Calls in the function that revert Phred scores to probabilities and assign genotypes (homoZ, heteroZ)

Calculates the proportion of hets.

Boolean vector with 1s for the lines to keep that have a proportion of hets inferior to the chosen theshold.

"""

def find_lines(t, output_vec, T, n):

print("Creating vector.")

# Choose the threshold to separate true homozygotes and true hets from the rest

#n # number of individuals expected in the file

t = str(t) # file name has the threshold as a string.

# read in the file

filename = "heteroZ_label_{}.txt".format(t)

final = open(output_vec, "w+")

file = open(filename, "r")

for line in file:

line = line.strip()

store = line.split()

# Control here for the good length

if len(store) == n:

hets = 0 # Records the number of hets

homs = 0 # Records the number of homs

NA = 0 # Record non-assigned genotypes

for item in store:

if item == "1":

hets += 1

elif item == "0":

homs += 1

elif item == "2":

NA += 1

if NA != n:

prop = hets/(hets+homs)

if prop >= T: # exclusive: everything bigger or equal to T is not kept

final.write("0\n") # We do not keep true hets, they receive a 0

elif prop < T:

final.write("1\n") # We keep every hets that has a proportion less than 80% per SNP

else:

print("Proportion does not make sense.")

print(hets,homs)

break

elif NA == n:

print("Only NA's in this line.")

final.write("0\n")

else:

print("The line has too many individuals.")

print(len(line))

break

"""

Read the original vcf file and create a dictionary with the scaffold name, mutation start and stop, and nucleotide change

"""

# Reference

def create_dict_1(input_file, vcf_file, output_name):

print("Creating dictionary.")

# Open the vcl file and keep the lines we are interested in from the vector created above

final = open(output_name, "w+")

vec = open(input_file, "r")

file = open(vcf_file, "r")

my_dict = {}

for line in file:

if not line.startswith('Potrs'):

pass

else:

decision = vec.readline()

decision = decision.rstrip()

if decision == '1' :

line = line.rstrip()

line = line.split()

if line[0] not in my_dict:

my_dict[line[0]] = {}

my_dict[line[0]][line[1]] = 1

else:

if my_dict[line[0]] != line[1]:

my_dict[line[0]][line[1]] = 1

else:

my_dict[line[0]][line[1]] += 1

elif decision == '0' :

pass

else:

print("Problem with input vector.")

for key1 in my_dict:

for key2 in my_dict[key1]:

final.write(f'{key1}\t{key2}\t{my_dict[key1][key2]}\n')

"""

Compare the created dictionary from Pando-only samples with variants found in Pando friends and

in the panel of normals

"""

# Comparisons

def create_dict_2(dictionary, vcf_file, output_name):

print("Creating dictionary 2.")

# Open the vcl file and keep the lines we are interested in from the vector created above

final = open(output_name, "w+")

my_dict = open(dictionary, "r")

file = open(vcf_file, "r")

my_dict = {}

for line in file:

if not line.startswith('Potrs'):

pass

else:

line = line.rstrip()

line = line.split()

if line[0] not in my_dict:

my_dict[line[0]] = {}

my_dict[line[0]][line[1]] = 1

else:

if my_dict[line[0]] != line[1]:

my_dict[line[0]][line[1]] = 1

else:

my_dict[line[0]][line[1]] += 1

for key1 in my_dict:

for key2 in my_dict[key1]:

final.write(f'{key1}\t{key2}\t{my_dict[key1][key2]}\n')

# Compare the files

def compare_dict(dict_1, dict_2, output):

with open(dict_1, 'r') as file1:

with open(dict_2, 'r') as file2:

same = set(file1).intersection(file2)

same.discard('\n')

with open(output, 'w') as file_out:

for line in same:

file_out.write(line)

# Call the functions here

find_lines(0.06, "bool_vec_pando.txt", .5, 109)

create_dict_1("bool_vec_pando.txt", "filtered2xHiCov_pando_only_variants.vcf", "pando.dict")

create_dict_2("pando.dict", "../pando_friends_variants/filtered2xHiCov_pando_friends_variants.vcf", "friends.dict")

compare_dict("pando.dict", "friends.dict", "common_pando_friends.dict")