Python Coding
Click on the drop down below to view code. Copy and paste into a compiler to use code.
Data Manipulation: Find Union and Intersection of Gene Sets
## Author: Catherine Calma
## Date: 9/15/21
## Description: User will input mode (?-get current directory, #-change directory, T-convert text to csv, O-compare overlap of two csv files, J-join two csv files, C-convert FBgn to gene IDs)
## T: use this mode to convert .txt files to .csv files for easier comparison
## The output will be a user-named .csv file in the current working directory
## O: enter two files to compare. The output will be a user-named .csv file in the
## current working directory containing all overlap between the two files
## ?: get working directory
## #: change working directory
## J: enter two .csv files to join. The output will be a user-named .csv file with the union
## of the entered files
## C: convert a FBgn list to gene IDs. External web link
## Currently only works for when files with genes ONLY are entered and returns CSV with
## genes ONLY. C:\Users\cathe\Desktop\Brain VIP\Data Sets
import os
import webbrowser
def read_file(file_to_open):
    """Open a text file and read every line of it.

    Args:
        file_to_open: Path of the file to read.

    Returns:
        A ``(lines, gene_file)`` tuple: the list of raw lines (newlines
        kept) and the still-open file object.  The caller is responsible
        for closing ``gene_file``.

    Raises:
        OSError: If the file cannot be opened.
    """
    gene_file = open(file_to_open, 'r')
    try:
        lines = gene_file.readlines()
    except Exception:
        # Do not leak the handle when reading fails part-way through
        # (e.g. on a decoding error); the caller never receives it.
        gene_file.close()
        raise
    return lines, gene_file
def csv_to_list(lines, gene_file):
    """Turn the lines of a one-column csv file into a list of entries.

    Trailing commas and newlines are removed from each line, and lines
    that are empty after that are dropped.  ``gene_file`` is closed once
    all lines have been processed.
    """
    stripped_lines = (raw_line.rstrip(',\n') for raw_line in lines)
    entries = [entry for entry in stripped_lines if entry != '']
    gene_file.close()
    return entries
def create_list(lines, gene_file):
    """Build a list holding each text line without its trailing newline(s).

    Every input line yields one entry (blank lines become empty strings).
    ``gene_file`` is closed once all lines have been processed.
    """
    entries = [raw_line.rstrip('\n') for raw_line in lines]
    gene_file.close()
    return entries
def store_output(file_to_store, gene_list):
    """Write each item of ``gene_list`` to a one-column csv file.

    Args:
        file_to_store: Path of the output file; it is created or overwritten.
        gene_list: Iterable of items; each is written as ``str(item)``
            followed by a comma and a newline.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the file even if a write fails.
    with open(file_to_store, 'w') as out_file:
        out_file.writelines(str(item) + ',\n' for item in gene_list)
def compare(gene_list_one, gene_list_two):
    """Return the genes that appear in both lists (their intersection).

    The result contains each shared gene once, in the order it first
    appears in ``gene_list_one``.  Building the result this way instead of
    converting a set back to a list keeps the output order identical from
    run to run (set iteration order for strings varies between runs).

    Args:
        gene_list_one: First list of hashable gene identifiers.
        gene_list_two: Second list of hashable gene identifiers.

    Returns:
        A list of the identifiers present in both inputs.
    """
    in_second = set(gene_list_two)
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return [gene for gene in dict.fromkeys(gene_list_one) if gene in in_second]
def join(gene_list_one, gene_list_two):
    """Return the union of two gene lists without duplicates.

    Genes are listed in first-seen order: everything from
    ``gene_list_one`` first, followed by the genes that only occur in
    ``gene_list_two``.  This keeps the output order identical from run to
    run (converting a set back to a list does not).

    Args:
        gene_list_one: First list of hashable gene identifiers.
        gene_list_two: Second list of hashable gene identifiers.

    Returns:
        A list of every identifier present in either input, each once.
    """
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(list(gene_list_one) + list(gene_list_two)))
def main():
    """Run the interactive gene-set menu until the user enters Q.

    Modes read from the keyboard:
        T  convert a text file to a one-column csv file
        O  write the overlap (intersection) of two csv files
        J  write the join (union) of two csv files
        ?  print the current working directory
        #  change the working directory
        C  open the external FBgn-to-gene-ID converter in a web browser

    File and directory errors (for example a mistyped name) are reported
    and the menu continues instead of ending the program with a traceback.
    """
    prompt = 'A'
    while prompt != 'Q':
        mode = input('What would you like to do? (?-get current directory, #-change\n'
                     'directory, T-convert text to csv, O-compare overlap two csv files, J-join two csv files\n'
                     'C-convert FBgn to Gene IDs)\n')
        try:
            if mode == 'T':
                file_to_open = input('Enter the name of a text file you want to open:\n')
                file_to_store = input('Name the csv file you want to use for outputs:\n')
                gene_list = create_list(*read_file(file_to_open))
                store_output(file_to_store, gene_list)
                print('Your data is now in ' + file_to_store + '.\n')
            elif mode in ('O', 'J'):
                file_one = input('Enter name of first csv file:\n')
                file_two = input('Enter name of second csv file:\n')
                file_to_store = input('Name the csv file you want to use for outputs:\n')
                # Read and close each file before opening the next one so a
                # failure on the second file cannot leave the first one open.
                gene_list_one = csv_to_list(*read_file(file_one))
                gene_list_two = csv_to_list(*read_file(file_two))
                if mode == 'O':
                    store_output(file_to_store, compare(gene_list_one, gene_list_two))
                    print('Overlap from ' + file_one + ' and ' + file_two
                          + ' are now stored in ' + file_to_store + '.\n')
                else:
                    store_output(file_to_store, join(gene_list_one, gene_list_two))
                    print(file_one + ' and ' + file_two + ' are now joined in '
                          + file_to_store + '.\n')
            elif mode == '?':
                # give current working directory
                print('Your current working directory is ' + os.getcwd())
            elif mode == '#':
                path = input('What directory to use? (C:\\...)\n')
                os.chdir(path)
            elif mode == 'C':
                webbrowser.open("https://www.biotools.fr/drosophila/fbgn_converter")
            else:
                print('Invalid mode')
        except (OSError, UnicodeDecodeError) as error:
            # Missing file, bad directory, unreadable content, etc.: report
            # it and fall through to the continue/quit prompt.
            print('Could not complete the request: ' + str(error))
        prompt = input('Would you like to continue? (hit enter or enter Q to quit)\n')


if __name__ == '__main__':
    main()
Get Cut Offs for Cell Types Average
## Author: Catherine Calma
## Date: 9/15/21
##Last Update: 10/06/21
## Description: This program is made specifically for sorting our data set
## Cell_Types_Avg_Counts.csv or data sets of the same format.
## The program will prompt for the file name, which it will then
## read into a header line and data lines. It will then extract
## data of the same row placement to create lists of columns. The user
## will be prompted for a cut-off value. This will be the lowest accepted
## value for gene expression. The program will iterate through each data column
## and return only gene IDs with expressions equal to or greater than the cut-off.
## The user then will be prompted to name an output file for each data column NOT
## including the first column, which is just gene IDs. In the case of
## Cell_Types_Avg_Counts.csv, the columns are as follows: MB-ab neuron, MB-gamma neuron,
## non-MB neuron, and neuroglia. Once the program has run, you will find user-named
## output files in your current working directory.
##
## We decided to use a 140 as our cut off for gene expression
##
import os
#get header by reading first line and separating into a list
def get_header(gene_file):
    """Read the next line of ``gene_file`` and split it into column names.

    The line terminator is removed before splitting so the last column
    name does not carry a trailing newline.

    Args:
        gene_file: An open text file (or file-like object) positioned at
            the header line.

    Returns:
        The list of comma-separated header fields; ``['']`` when the file
        is empty.
    """
    header_line = gene_file.readline().rstrip('\r\n')
    return header_line.split(',')
#open file, call header, and read remaining lines into a list
def read_file(file_to_open):
    """Read a csv file into its header fields and its remaining lines.

    Args:
        file_to_open: Path of the file to read.

    Returns:
        A ``(header, lines)`` tuple: the header as returned by
        ``get_header`` and the list of raw data lines (newlines kept).

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(file_to_open, 'r') as gene_file:
        header = get_header(gene_file)
        lines = gene_file.readlines()
    return header, lines
#For each line, use respective placement to create column lists
def create_lists(lines):
    """Split comma-separated data lines into five column lists.

    Blank lines (such as a trailing empty line at the end of the file)
    are skipped.  Fields beyond the fifth are ignored.

    Args:
        lines: Iterable of text lines, each with at least five
            comma-separated fields.

    Returns:
        A tuple ``(col_1, col_2, col_3, col_4, col_5)`` of lists holding
        the first five fields of every line, as strings.

    Raises:
        IndexError: If a non-blank line has fewer than five fields.
    """
    columns = ([], [], [], [], [])
    for line_number, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        fields = line.rstrip('\n').split(',')
        if len(fields) < len(columns):
            # Same exception type as plain indexing would raise, but with
            # enough context to locate the bad row.
            raise IndexError(
                'data line {}: expected at least {} comma-separated fields, '
                'found {}'.format(line_number, len(columns), len(fields)))
        for column, field in zip(columns, fields):
            column.append(field)
    return columns
#Get cut-off value, will return only genes with expression of
#cut-off or greater
def get_sorted_dicts(col_1, col_2, col_3, col_4, col_5, cut_off=None):
    """Keep, per data column, the genes whose expression meets the cut-off.

    Each value is paired with the gene symbol from the same row.

    Args:
        col_1: Gene symbols, one per data row.
        col_2, col_3, col_4, col_5: Expression values for the four data
            columns, row-aligned with ``col_1``; each value must be
            convertible with ``float``.
        cut_off: Lowest accepted expression value.  When ``None`` (the
            default) the user is prompted for it.

    Returns:
        A list of four dicts, one per data column in order, mapping gene
        symbol to the original (unconverted) expression value for every
        row whose value is greater than or equal to the cut-off.

    Raises:
        ValueError: If the cut-off or an expression value is not numeric.
    """
    if cut_off is None:
        cut_off = input("Select a cut off value:\n")
    # float rather than int so that decimal cut-offs are accepted too.
    cut_off = float(cut_off)

    sorted_total_data = []
    for data_column in (col_2, col_3, col_4, col_5):
        selected = {}
        # zip keeps symbol and value on the same row.
        for gene_symbol, expression in zip(col_1, data_column):
            if float(expression) >= cut_off:
                selected[gene_symbol] = expression
        sorted_total_data.append(selected)
    return sorted_total_data
def results(header, col_num, data_column, out_file_name):
    """Write one column's header and selected gene IDs to a csv file.

    Args:
        header: List of column names.
        col_num: Index into ``header`` of the column being written.
        data_column: Dict whose keys are the gene IDs to write.
        out_file_name: Path of the output file; it is created or
            overwritten.

    Raises:
        IndexError: If ``col_num`` is outside ``header``.
        OSError: If the file cannot be opened or written.
    """
    # A header cell read straight from a file may still end in a newline;
    # strip it so the header is not followed by a blank line.
    column_name = str(header[col_num]).rstrip('\r\n')
    # The context manager closes the file even if a write fails.
    with open(out_file_name, 'w') as out_file:
        out_file.write(column_name + '\n')
        out_file.writelines(str(gene_id) + ',\n' for gene_id in data_column)
#should print the gene lists over 140
def main():
    """Prompt for a data file and a cut-off and write one csv per data column.

    Steps: optionally change the working directory, read the chosen file,
    split it into columns, filter each data column by the cut-off the user
    enters, then ask for an output file name for each of the four data
    columns and write the selected gene IDs there.
    """
    print('Your working directory is ' + os.getcwd())
    change_dir = input('Would you like to set the working directory?(N for no)\n')
    # Accept "n" as well as "N" so a lowercase answer is not read as "yes".
    if change_dir.strip().upper() != 'N':
        path = input('Choose working directory:(C:\\...)\n')
        os.chdir(path)
    file_to_open = input('Enter the name of a text file you want to open:\n')
    header, lines = read_file(file_to_open)
    col_1, col_2, col_3, col_4, col_5 = create_lists(lines)
    sorted_total_data = get_sorted_dicts(col_1, col_2, col_3, col_4, col_5)
    # Header index 0 is the gene-ID column, so data columns start at 1.
    ordinals = ('first', 'second', 'third', 'fourth')
    for col_num, ordinal in enumerate(ordinals, start=1):
        file_to_store = input('Name the csv file you want to use for '
                              + ordinal + ' data column:\n')
        results(header, col_num, sorted_total_data[col_num - 1], file_to_store)


if __name__ == '__main__':
    main()