Python Coding

Click on the drop-down below to view the code. Copy and paste it into a Python interpreter or IDE to run it.

Data Manipulation: Find Union and Intersection of Gene Sets

## Author: Catherine Calma

## Date: 9/15/21

## Description: User will input mode (?-get current directory, T-convert text to csv, O-compare two csv files)

##              T: use this mode to convert .txt files to .csv files for easier comparison

##                 The output will be a user-named .csv file in the current working directory

##              O: enter two files to compare. The output will be a user-named .csv file in the

##                 current working directory containing all overlap in two files

##              ?: get working directory

##              #: change working directory

##              J: enter two .csv files to join. The output will be a user-named .csv file with the union

##                  of entered files

##              C: convert a FBgn list to gene IDs. External web link

##              Currently only works when files containing genes ONLY are entered, and returns a CSV with

##              genes ONLY. C:\Users\cathe\Desktop\Brain VIP\Data Sets


import os

import webbrowser



def read_file(file_to_open):
    """Open a text file and read all of its lines.

    Args:
        file_to_open: Name or path of the file to open for reading.

    Returns:
        A ``(lines, gene_file)`` tuple. ``lines`` is the list produced by
        ``readlines()`` (line endings are kept) and ``gene_file`` is the
        still-open file object; the caller is responsible for closing it.

    Raises:
        OSError: If the file cannot be opened.
    """
    gene_file = open(file_to_open, 'r')
    try:
        lines = gene_file.readlines()
    except Exception:
        # Reading failed (e.g. a decoding error): nobody will receive the
        # handle, so close it here instead of leaking it.
        gene_file.close()
        raise
    # On success the handle is deliberately returned open for the caller.
    return lines, gene_file


def csv_to_list(lines, gene_file):
    """Convert raw csv lines into a list of entries and close the file.

    Trailing commas and newline characters are removed from every line;
    lines that are empty after that are dropped.

    Args:
        lines: Iterable of text lines read from a csv file.
        gene_file: File object the lines came from; it is closed here.

    Returns:
        List of the non-empty, stripped entries in their original order.
    """
    stripped = (raw.rstrip(',\n') for raw in lines)
    genes = [entry for entry in stripped if entry != '']
    gene_file.close()
    return genes


def create_list(lines, gene_file):
    """Build a list from lines of text and close the file.

    Only trailing newline characters are removed; empty lines are kept as
    empty strings.

    Args:
        lines: Iterable of text lines.
        gene_file: File object the lines came from; it is closed here.

    Returns:
        List with one entry per input line, in the original order.
    """
    genes = [text.rstrip('\n') for text in lines]
    gene_file.close()
    return genes


def store_output(file_to_store, gene_list):
    """Write each item of ``gene_list`` to a file, one per line.

    Every item is converted with ``str()`` and followed by a comma and a
    newline. An existing file of the same name is overwritten.

    Args:
        file_to_store: Name or path of the output file.
        gene_list: Iterable of items to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(file_to_store, 'w') as out_file:
        out_file.writelines(str(item) + ',\n' for item in gene_list)


def compare(gene_list_one, gene_list_two):
    """Return the genes present in both lists (set intersection).

    The result contains each shared gene once, in the order of its first
    appearance in ``gene_list_one``, so repeated runs on the same inputs
    produce identical output.

    Args:
        gene_list_one: First iterable of hashable gene entries.
        gene_list_two: Second iterable of hashable gene entries.

    Returns:
        List of the entries found in both inputs, without duplicates.
    """
    in_second = set(gene_list_two)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return [gene for gene in dict.fromkeys(gene_list_one) if gene in in_second]


def join(gene_list_one, gene_list_two):
    """Return the union of two gene lists.

    The result contains every distinct gene once: first those of
    ``gene_list_one`` in order of first appearance, then the genes that only
    occur in ``gene_list_two``. The ordering is therefore the same on every
    run for the same inputs.

    Args:
        gene_list_one: First iterable of hashable gene entries.
        gene_list_two: Second iterable of hashable gene entries.

    Returns:
        List of all distinct entries from both inputs.
    """
    # A dict keeps insertion order and ignores keys it has already seen.
    merged = dict.fromkeys(gene_list_one)
    merged.update(dict.fromkeys(gene_list_two))
    return list(merged)


def main():
    """Run the interactive menu until the user answers 'Q' at the continue prompt.

    Modes:
        T: convert a text file into a csv file.
        O: write the overlap of two csv files to a new csv file.
        J: write the union of two csv files to a new csv file.
        ?: print the current working directory.
        #: change the working directory.
        C: open the FBgn converter web page in a browser.

    A file or directory that cannot be opened is reported and the loop
    continues, so a mistyped name does not end the session.
    """
    prompt = 'A'
    while prompt != 'Q':
        mode = input(
            'What would you like to do? (?-get current directory, #-change\n'
            'directory, T-convert text to csv, O-compare overlap two csv files, J-join two csv files\n'
            'C-convert FBgn to Gene IDs)\n'
        )
        try:
            if mode == 'T':
                file_to_open = input('Enter the name of a text file you want to open:\n')
                file_to_store = input('Name the csv file you want to use for outputs:\n')
                lines, gene_file = read_file(file_to_open)
                gene_list = create_list(lines, gene_file)
                store_output(file_to_store, gene_list)
                print('Your data is now in ' + file_to_store + '.\n')
            elif mode in ('O', 'J'):
                file_one = input('Enter name of first csv file:\n')
                file_two = input('Enter name of second csv file:\n')
                file_to_store = input('Name the csv file you want to use for outputs:\n')
                # Finish with the first file (csv_to_list closes it) before
                # opening the second, so a failure on the second file does
                # not leave the first handle open.
                lines_one, genes_one = read_file(file_one)
                gene_list_one = csv_to_list(lines_one, genes_one)
                lines_two, genes_two = read_file(file_two)
                gene_list_two = csv_to_list(lines_two, genes_two)
                if mode == 'O':
                    shared_genes = compare(gene_list_one, gene_list_two)
                    store_output(file_to_store, shared_genes)
                    print('Overlap from ' + file_one + ' and ' + file_two + ' are now stored in ' + file_to_store + '.\n')
                else:
                    joined_genes = join(gene_list_one, gene_list_two)
                    store_output(file_to_store, joined_genes)
                    print(file_one + ' and ' + file_two + ' are now joined in ' + file_to_store + '.\n')
            elif mode == '?':
                # Give the current working directory.
                print('Your current working directory is ' + os.getcwd())
            elif mode == '#':
                path = input('What directory to use? (C:\\...)\n')
                os.chdir(path)
            elif mode == 'C':
                webbrowser.open("https://www.biotools.fr/drosophila/fbgn_converter")
            else:
                print('Invalid mode')
        except OSError as err:
            # Missing file, bad directory, permission problem, ...: tell the
            # user and fall through to the continue prompt.
            print('Could not complete the request: ' + str(err))
        prompt = input('Would you like to continue? (hit enter or enter Q to quit)\n')


# Start the interactive menu only when this file is run as a script, so that
# importing it (e.g. to reuse the helper functions) does not prompt for input.
if __name__ == '__main__':
    main()


Get Cut Offs for Cell Types Average

## Author: Catherine Calma

## Date: 9/15/21

##Last Update: 10/06/21

## Description: This program is made specifically for sorting our data set

##              Cell_Types_Avg_Counts.csv or data sets of the same format.

##              The program will prompt for the file name, which it will then

##              read into a header line and data lines. It will then extract

##              data of the same row placement to create lists of columns. The user

##              will be prompted for a cut-off value. This will be the lowest accepted

##              value for gene expression. The program will iterate through each data column

##              and return only gene IDs with expressions equal to or greater than the cut-off.

##              The user then will be prompted to name an output file for each data column NOT

##              including the first column, which is just gene IDs. In the case of

##              Cell_Types_Avg_Counts.csv, the columns are as follows: MB-ab neuron, MB-gamma neuron,

##              non-MB neuron, and neuroglia. Once the program has run, you will find user-named

##              output files in your current working directory.

##

## We decided to use 140 as our cut-off for gene expression

##


import os

#get header by reading first line and separating into a list

def get_header(gene_file):
    """Read the next line of an open file and split it into header cells.

    The line ending is removed before splitting, so the last cell does not
    carry a trailing newline.

    Args:
        gene_file: Open text file (or file-like object) positioned at the
            header line. One line is consumed from it.

    Returns:
        List of the comma-separated cells of that line. An exhausted file
        yields ``['']``.
    """
    header_line = gene_file.readline()
    return header_line.rstrip('\r\n').split(',')


#open file, call header, and read remaining lines into a list

def read_file(file_to_open):
    """Read a csv file into its header cells and its remaining raw lines.

    Args:
        file_to_open: Name or path of the file to read.

    Returns:
        A ``(header, lines)`` tuple: ``header`` is whatever ``get_header``
        returns for the first line, and ``lines`` is the list of all
        following lines as returned by ``readlines()`` (line endings kept).

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(file_to_open, 'r') as gene_file:
        header = get_header(gene_file)
        lines = gene_file.readlines()
    return header, lines



#For each line, use respective placement to create column lists

def create_lists(lines):
    """Split comma-separated data lines into five column lists.

    Blank lines (for example an empty line at the end of the file) are
    skipped. Fields beyond the fifth are ignored.

    Args:
        lines: Iterable of text lines, each with at least five
            comma-separated fields.

    Returns:
        Tuple ``(col_1, col_2, col_3, col_4, col_5)`` of lists of strings,
        where ``col_n`` holds the n-th field of every non-blank line.

    Raises:
        IndexError: If a non-blank line has fewer than five fields.
    """
    columns = ([], [], [], [], [])
    for line in lines:
        line = line.rstrip('\n')
        if not line.strip():
            # Nothing to parse; indexing the fields below would fail.
            continue
        fields = line.split(',')
        # Index explicitly (rather than zip) so a short row still raises.
        for index, column in enumerate(columns):
            column.append(fields[index])
    return columns


#Get cut-off value, will return only genes with expression of

#cut-off or greater

def get_sorted_dicts(col_1, col_2, col_3, col_4, col_5, cut_off=None):
    """Keep, per data column, the genes whose expression reaches the cut-off.

    Args:
        col_1: Gene symbols, one per data row.
        col_2: Expression values of the first data column, row-aligned with
            ``col_1``; each must be convertible with ``float()``.
        col_3: Expression values of the second data column.
        col_4: Expression values of the third data column.
        col_5: Expression values of the fourth data column.
        cut_off: Lowest accepted expression value. When ``None`` (the
            default) the user is prompted for it.

    Returns:
        List of four dicts, one per data column in the order given, mapping
        gene symbol to the original (unconverted) value for every row whose
        value is greater than or equal to the cut-off.

    Raises:
        ValueError: If the cut-off or an expression value is not numeric.
    """
    if cut_off is None:
        cut_off = input("Select a cut off value:\n")
    # float() rather than int() so a cut-off such as "140.5" is accepted.
    cut_off = float(cut_off)

    sorted_total_data = []
    for data_column in (col_2, col_3, col_4, col_5):
        # zip pairs every value with the gene symbol of the SAME row. A
        # manually incremented index that is bumped before use pairs each
        # value with the next row's gene and overruns on the last row.
        sorted_total_data.append({
            gene: value
            for gene, value in zip(col_1, data_column)
            if float(value) >= cut_off
        })
    return sorted_total_data


def results(header, col_num, data_column, out_file_name):
    """Write one column title followed by the selected gene IDs to a file.

    The first output line is ``header[col_num]``; every key of
    ``data_column`` follows on its own line, terminated by a comma. An
    existing file of the same name is overwritten.

    Args:
        header: Sequence of header cells.
        col_num: Index into ``header`` of the title to write.
        data_column: Dict keyed by gene ID; only the keys are written.
        out_file_name: Name or path of the output file.

    Raises:
        IndexError: If ``col_num`` is outside ``header``.
        OSError: If the file cannot be opened or written.
    """
    # A header cell may still carry the line ending of the raw header line
    # (the last cell of an unstripped split does); remove it so the title is
    # not followed by an empty line.
    column_title = str(header[col_num]).rstrip('\r\n')
    # The context manager closes the file even if a write fails.
    with open(out_file_name, 'w') as out_file:
        out_file.write(column_title + '\n')
        out_file.writelines(str(gene) + ',\n' for gene in data_column)


def main():
    """Prompt for an input file and write one filtered gene list per data column.

    Shows the working directory and optionally changes it, reads the chosen
    file, splits it into columns, filters each data column against the
    cut-off, and then asks for an output file name for each of the four
    data columns in turn.
    """
    print('Your working directory is ' + os.getcwd())
    answer = input('Would you like to set the working directory?(N for no)\n')
    if answer != 'N':
        os.chdir(input('Choose working directory:(C:\\...)\n'))

    source_name = input('Enter the name of a text file you want to open:\n')
    header, lines = read_file(source_name)
    genes, data_1, data_2, data_3, data_4 = create_lists(lines)
    per_column_genes = get_sorted_dicts(genes, data_1, data_2, data_3, data_4)

    # Header index 0 is the gene ID column, so data columns start at 1.
    ordinals = ('first', 'second', 'third', 'fourth')
    for col_num, ordinal in enumerate(ordinals, start=1):
        out_name = input('Name the csv file you want to use for ' + ordinal + ' data column:\n')
        results(header, col_num, per_column_genes[col_num - 1], out_name)


# Run the prompts only when this file is executed as a script, so that
# importing it to reuse the helper functions does not ask for input.
if __name__ == '__main__':
    main()