Python Coding

Click on the drop-down below to view the code. Copy and paste it into a Python interpreter or IDE to run it.

Data Manipulation: Find Union and Intersection of Gene Sets

## Author: Catherine Calma

## Date: 9/15/21

## Description: User will input mode (?-get current directory, #-change directory, T-convert text to csv, O-compare overlap of two csv files, J-join two csv files, C-convert FBgn to gene IDs)

## T: use this mode to convert .txt files to .csv files for easier comparison

## The output will be a user-named .csv file in the current working directory

## O: enter two files to compare. The output will be a user-named .csv file in the

## current working directory containing all overlap in two files

## ?: get working directory

## #: change working directory

## J: enter two .csv files to join. The output will be a user-named .csv file with the union

## of entered files

## C: convert a FBgn list to gene IDs. External web link

## Currently only works when files containing genes ONLY are entered, and returns a CSV with

## genes ONLY. C:\Users\cathe\Desktop\Brain VIP\Data Sets

import os

import webbrowser

def read_file(file_to_open):
    """Open a text file and read all of its lines.

    The file object is returned still open on success; the callers in this
    script hand it to ``csv_to_list`` / ``create_list``, which close it.

    Args:
        file_to_open: Path of the text file to read.

    Returns:
        A ``(lines, gene_file)`` tuple: the list of lines (line endings
        kept) and the open file object they were read from.

    Raises:
        OSError: If the file cannot be opened.
    """
    gene_file = open(file_to_open, 'r')
    try:
        lines = gene_file.readlines()
    except Exception:
        # No caller has the handle yet, so release it before propagating.
        gene_file.close()
        raise
    return lines, gene_file

def csv_to_list(lines, gene_file):
    """Build a gene list from the raw lines of a one-gene-per-row csv file.

    Trailing commas and newlines are removed from each line, and lines that
    end up empty are dropped.

    Args:
        lines: Raw lines read from the csv file.
        gene_file: The file object the lines came from; it is closed here.

    Returns:
        List of the non-empty, stripped entries in their original order.
    """
    stripped_entries = (raw_line.rstrip(',\n') for raw_line in lines)
    genes = [entry for entry in stripped_entries if entry != '']
    gene_file.close()
    return genes

def create_list(lines, gene_file):
    """Build a gene list from the raw lines of a plain text file.

    Only the trailing newline is removed from each line; empty lines are
    kept as empty strings.

    Args:
        lines: Raw lines read from the text file.
        gene_file: The file object the lines came from; it is closed here.

    Returns:
        List with one entry per input line, in the original order.
    """
    genes = [raw_line.rstrip('\n') for raw_line in lines]
    gene_file.close()
    return genes

def store_output(file_to_store, gene_list):
    """Write a gene list to a csv file, one ``<gene>,`` entry per line.

    Args:
        file_to_store: Path of the output file; it is overwritten.
        gene_list: Iterable of items to write; each is converted with str().

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    # The context manager closes the file even when a write fails part-way.
    with open(file_to_store, 'w') as out_file:
        out_file.writelines(str(item) + ',\n' for item in gene_list)

def compare(gene_list_one, gene_list_two):
    """Return the genes present in both lists (set intersection).

    The result contains each shared gene once, ordered by its first
    appearance in ``gene_list_one``. Converting a set straight to a list
    gives an arbitrary order that can change from run to run for strings,
    which made the stored output files differ between identical runs.

    Args:
        gene_list_one: First collection of hashable gene identifiers.
        gene_list_two: Second collection of hashable gene identifiers.

    Returns:
        List of the identifiers found in both inputs, without duplicates.
    """
    second_genes = set(gene_list_two)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return [gene for gene in dict.fromkeys(gene_list_one) if gene in second_genes]

def join(gene_list_one, gene_list_two):
    """Return the union of two gene lists.

    The result contains each gene once, in first-seen order (all of
    ``gene_list_one`` first, then the new genes from ``gene_list_two``).
    Converting a set straight to a list gives an arbitrary order that can
    change from run to run for strings, which made the stored output files
    differ between identical runs.

    Args:
        gene_list_one: First collection of hashable gene identifiers.
        gene_list_two: Second collection of hashable gene identifiers.

    Returns:
        List of every identifier found in either input, without duplicates.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys([*gene_list_one, *gene_list_two]))

def _load_csv_genes(file_name):
    """Read one csv file into a gene list; the file is closed on return."""
    lines, gene_file = read_file(file_name)
    return csv_to_list(lines, gene_file)


def _combine_csv_files(combine, message_prefix, message_verb):
    """Prompt for two csv inputs and an output name, then store combine(one, two)."""
    file_one = input('Enter name of first csv file:\n')
    file_two = input('Enter name of second csv file:\n')
    file_to_store = input('Name the csv file you want to use for outputs:\n')
    # Each file is fully read and closed before the next one is opened, so a
    # missing second file no longer leaves the first one open.
    gene_list_one = _load_csv_genes(file_one)
    gene_list_two = _load_csv_genes(file_two)
    store_output(file_to_store, combine(gene_list_one, gene_list_two))
    print(message_prefix + file_one + ' and ' + file_two + message_verb
          + file_to_store + '.\n')


def main():
    """Run the interactive menu until the user answers 'Q' to the continue prompt.

    Modes: ``T`` converts a text file to csv, ``O`` stores the overlap of two
    csv files, ``J`` stores the union of two csv files, ``?`` prints the
    working directory, ``#`` changes it, and ``C`` opens the FBgn converter
    web page. Any other input prints 'Invalid mode'.

    A file or directory that cannot be opened is reported and the menu
    continues, instead of ending the program with a traceback.
    """
    prompt = 'A'
    while prompt != 'Q':
        mode = input('What would you like to do? (?-get current directory, #-change\n'
                     'directory, T-convert text to csv, O-compare overlap two csv files, J-join two csv files\n'
                     'C-convert FBgn to Gene IDs)\n')
        try:
            if mode == 'T':
                file_to_open = input('Enter the name of a text file you want to open:\n')
                file_to_store = input('Name the csv file you want to use for outputs:\n')
                lines, gene_file = read_file(file_to_open)
                gene_list = create_list(lines, gene_file)
                store_output(file_to_store, gene_list)
                print('Your data is now in ' + file_to_store + '.\n')
            elif mode == 'O':
                _combine_csv_files(compare, 'Overlap from ', ' are now stored in ')
            elif mode == 'J':
                _combine_csv_files(join, '', ' are now joined in ')
            elif mode == '?':
                # give current working directory
                print('Your current working directory is ' + os.getcwd())
            elif mode == '#':
                path = input('What directory to use? (C:\\...)\n')
                os.chdir(path)
            elif mode == 'C':
                webbrowser.open("https://www.biotools.fr/drosophila/fbgn_converter")
            else:
                print('Invalid mode')
        except OSError as error:
            # Mistyped file or directory names are expected in an interactive
            # session; report them and return to the menu.
            print('Could not complete the request: ' + str(error))
        prompt = input('Would you like to continue? (hit enter or enter Q to quit)\n')

# Start the interactive menu only when the file is run as a script, so that
# importing it (e.g. to reuse compare/join) does not prompt for input.
if __name__ == '__main__':
    main()

Get Cut Offs for Cell Types Average

## Author: Catherine Calma

## Date: 9/15/21

##Last Update: 10/06/21

## Description: This program is made specifically for sorting our data set

## Cell_Types_Avg_Counts.csv or data sets of the same format.

## The program will prompt for the file name, which it will then

## read into a header line and data lines. It will then extract

## data of the same row placement to create lists of columns. The user

## will be prompted for a cut-off value. This will be the lowest accepted

## value for gene expression. The program will iterate through each data column

## and return only gene IDs with expressions equal to or greater than the cut-off.

## The user then will be prompted to name an output file for each data column NOT

## including the first column, which is just gene IDs. In the case of

## Cell_Types_Avg_Counts.csv, the columns are as follows: MB-ab neuron, MB-gamma neuron,

## non-MB neuron, and neuroglia. Once the program has run, you will find user-named

## output files in your current working directory.

##

## We decided to use 140 as our cut-off for gene expression

##

import os

def get_header(gene_file):
    """Read the first line of an open file and split it into column names.

    The line ending is removed before splitting so the last column name does
    not carry a newline; previously that newline was written back out and
    left a blank line after the header of the last column's output file.

    Args:
        gene_file: Open text file (or file-like object) positioned at the
            header line. One line is consumed from it.

    Returns:
        List of the comma-separated header fields; ``['']`` for an empty file.
    """
    header_line = gene_file.readline()
    return header_line.rstrip('\r\n').split(',')

def read_file(file_to_open):
    """Open a data file and split it into its header and its data lines.

    Args:
        file_to_open: Path of the file to read.

    Returns:
        A ``(header, lines)`` tuple: the header as returned by
        ``get_header`` and the list of remaining lines (line endings kept).

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(file_to_open, 'r') as gene_file:
        header = get_header(gene_file)
        lines = gene_file.readlines()
    return header, lines

def create_lists(lines):
    """Split comma-separated data lines into five column lists.

    Blank lines (for example a trailing empty line at the end of the file)
    are skipped; previously they raised IndexError.

    Args:
        lines: Data lines, each holding at least five comma-separated fields.

    Returns:
        A 5-tuple ``(col_1, col_2, col_3, col_4, col_5)`` of lists of
        strings, where ``col_k`` holds the k-th field of every row.

    Raises:
        IndexError: If a non-blank line has fewer than five fields.
    """
    rows = [line.rstrip('\n').split(',') for line in lines if line.strip()]
    col_1, col_2, col_3, col_4, col_5 = (
        [row[index] for row in rows] for index in range(5)
    )
    return col_1, col_2, col_3, col_4, col_5

def get_sorted_dicts(col_1, col_2, col_3, col_4, col_5, cut_off=None):
    """Keep, for each data column, the genes whose expression meets the cut-off.

    Each value is paired with the gene symbol on the same row. The previous
    version advanced its row counter before the lookup, so every value was
    stored under the next row's gene and a passing last row raised IndexError.

    Args:
        col_1: Gene symbols, one per row.
        col_2: Expression values of the first data column (parsed with float()).
        col_3: Expression values of the second data column.
        col_4: Expression values of the third data column.
        col_5: Expression values of the fourth data column.
        cut_off: Lowest accepted expression value. When None (the default)
            the user is prompted for it. Decimal values are accepted.

    Returns:
        List of four dicts, one per data column in order, mapping gene
        symbol to the original (unconverted) expression value for every row
        whose value is greater than or equal to the cut-off.

    Raises:
        ValueError: If the cut-off or an expression value is not numeric.
    """
    if cut_off is None:
        cut_off = input("Select a cut off value:\n")
    cut_off = float(cut_off)

    # first column is just the gene names
    gene_symbol = col_1

    sorted_total_data = []
    for data_column in (col_2, col_3, col_4, col_5):
        # {gene symbol: expression value} for the rows that pass the cut-off
        kept = {}
        for gene, value in zip(gene_symbol, data_column):
            if float(value) >= cut_off:
                kept[gene] = value
        sorted_total_data.append(kept)
    return sorted_total_data

def results(header, col_num, data_column, out_file_name):
    """Write one column's header and its kept gene IDs to a csv file.

    Args:
        header: List of column names.
        col_num: Index into ``header`` of the column being written.
        data_column: Dict whose keys are the gene IDs to write.
        out_file_name: Path of the output file; it is overwritten.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    # A header field taken from the end of the header line may still carry
    # its newline; drop it so the file does not get a blank second line.
    column_title = str(header[col_num]).rstrip('\n')
    # The context manager closes the file even when a write fails part-way.
    with open(out_file_name, 'w') as out_file:
        out_file.write(column_title + '\n')
        out_file.writelines(str(gene) + ',\n' for gene in data_column)

def main():
    """Prompt for a data file and write one gene list per data column.

    Shows the working directory and optionally changes it, reads the data
    file, filters each data column through ``get_sorted_dicts``, then asks
    for an output file name for each of the four data columns and writes
    the kept gene IDs there.
    """
    print('Your working directory is ' + os.getcwd())
    answer = input('Would you like to set the working directory?(N for no)\n')
    if answer != 'N':
        os.chdir(input('Choose working directory:(C:\\...)\n'))

    source_name = input('Enter the name of a text file you want to open:\n')
    header, lines = read_file(source_name)
    genes, first, second, third, fourth = create_lists(lines)
    kept_per_column = get_sorted_dicts(genes, first, second, third, fourth)

    # Header index 0 is the gene-ID column, so data columns start at 1.
    ordinals = ('first', 'second', 'third', 'fourth')
    for col_num, ordinal in enumerate(ordinals, start=1):
        target_name = input('Name the csv file you want to use for '
                            + ordinal + ' data column:\n')
        results(header, col_num, kept_per_column[col_num - 1], target_name)

# Start the prompts only when the file is run as a script, so that importing
# it (e.g. to reuse the column helpers) does not ask for input.
if __name__ == '__main__':
    main()