Post date: Jan 13, 2020 10:42:28 AM
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 9 17:27:15 2019
@author: Papaya
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Calls in the function that revert Phred scores to probabilities and assign genotypes (homoZ, heteroZ)
Calculates the proportion of hets.
Boolean vector with 1s for the lines to keep that have a proportion of hets inferior to the chosen theshold.
"""
def find_lines(t, output_vec, T, n):
print("Creating vector.")
# Choose the threshold to separate true homozygotes and true hets from the rest
#n # number of individuals expected in the file
t = str(t) # file name has the threshold as a string.
# read in the file
filename = "heteroZ_label_{}.txt".format(t)
final = open(output_vec, "w+")
file = open(filename, "r")
for line in file:
line = line.strip()
store = line.split()
# Control here for the good length
if len(store) == n:
hets = 0 # Records the number of hets
homs = 0 # Records the number of homs
NA = 0 # Record non-assigned genotypes
for item in store:
if item == "1":
hets += 1
elif item == "0":
homs += 1
elif item == "2":
NA += 1
if NA != n:
prop = hets/(hets+homs)
if prop >= T: # exclusive: everything bigger or equal to T is not kept
final.write("0\n") # We do not keep true hets, they receive a 0
elif prop < T:
final.write("1\n") # We keep every hets that has a proportion less than 80% per SNP
else:
print("Proportion does not make sense.")
print(hets,homs)
break
elif NA == n:
print("Only NA's in this line.")
final.write("0\n")
else:
print("The line has too many individuals.")
print(len(line))
break
"""
Read the original vcf file and create a dictionary with the scaffold name, mutation start and stop, and nucleotide change
"""
# Reference
def create_dict_1(input_file, vcf_file, output_name):
print("Creating dictionary.")
# Open the vcl file and keep the lines we are interested in from the vector created above
final = open(output_name, "w+")
vec = open(input_file, "r")
file = open(vcf_file, "r")
my_dict = {}
for line in file:
if not line.startswith('Potrs'):
pass
else:
decision = vec.readline()
decision = decision.rstrip()
if decision == '1' :
line = line.rstrip()
line = line.split()
if line[0] not in my_dict:
my_dict[line[0]] = {}
my_dict[line[0]][line[1]] = 1
else:
if my_dict[line[0]] != line[1]:
my_dict[line[0]][line[1]] = 1
else:
my_dict[line[0]][line[1]] += 1
elif decision == '0' :
pass
else:
print("Problem with input vector.")
for key1 in my_dict:
for key2 in my_dict[key1]:
final.write(f'{key1}\t{key2}\t{my_dict[key1][key2]}\n')
"""
Compare the created dictionary from Pando-only samples with variants found in Pando friends and
in the panel of normals
"""
# Comparisons
def create_dict_2(dictionary, vcf_file, output_name):
print("Creating dictionary 2.")
# Open the vcl file and keep the lines we are interested in from the vector created above
final = open(output_name, "w+")
my_dict = open(dictionary, "r")
file = open(vcf_file, "r")
my_dict = {}
for line in file:
if not line.startswith('Potrs'):
pass
else:
line = line.rstrip()
line = line.split()
if line[0] not in my_dict:
my_dict[line[0]] = {}
my_dict[line[0]][line[1]] = 1
else:
if my_dict[line[0]] != line[1]:
my_dict[line[0]][line[1]] = 1
else:
my_dict[line[0]][line[1]] += 1
for key1 in my_dict:
for key2 in my_dict[key1]:
final.write(f'{key1}\t{key2}\t{my_dict[key1][key2]}\n')
# Compare the files
def compare_dict(dict_1, dict_2, output):
with open(dict_1, 'r') as file1:
with open(dict_2, 'r') as file2:
same = set(file1).intersection(file2)
same.discard('\n')
with open(output, 'w') as file_out:
for line in same:
file_out.write(line)
# Call the functions here
find_lines(0.06, "bool_vec_pando.txt", .5, 109)
create_dict_1("bool_vec_pando.txt", "filtered2xHiCov_pando_only_variants.vcf", "pando.dict")
create_dict_2("pando.dict", "../pando_friends_variants/filtered2xHiCov_pando_friends_variants.vcf", "friends.dict")
compare_dict("pando.dict", "friends.dict", "common_pando_friends.dict")