Basic modification of Biological data frames using Pandas by Shelley Klompus
Shelley Klompus, a PhD student at the Weizmann Institute of Science, was one of my students in the Programming Bootcamp for Scientists.
In this video she describes her project.
The code:
examples/shelley_klompus_final_project.py
"""Compare the reactivity of flagella antigens with random-peptide controls.

Two CSV files are read from the directory given on the command line:

* ``library_prec_passed.csv`` - reactivity of each antigen in the assay
* ``library_uniref_funk.csv`` - biological identity of each antigen

Both are indexed by ``antigen_num`` and joined side by side.  Only two groups
of antigens are kept: positive controls (bacterial flagella) and negative
controls (random peptides).  Three files are written back to the same
directory: a box plot, basic statistics of the two groups, and the filtered
table itself.

Usage: ``python shelley_klompus_final_project.py PATH_TO_DIRECTORY``
"""
import io
import os
import sys
import time

import numpy as np
import pandas as pd

REACTIVITY_FILE = 'library_prec_passed.csv'
FUNCTION_FILE = 'library_uniref_funk.csv'

FIG_FILE_NAME = 'reactivity against bacterial flagella antigens.png'
STAT_FILE_NAME = 'reactivity against bacterial flagella antigens statistics.csv'
DF_FILE_NAME = 'flagella and random oligos df.csv'

USAGE = (
    "Save the two input files under the names: library_prec_passed.csv "
    "and library_uniref_funk.csv and provide only the path to the directory"
)


def parse_column(row):
    """Return the part of ``row['uniref_func']`` that precedes ``" n="``.

    The value is converted with ``str`` first, so a missing value (NaN)
    yields the string ``"nan"`` instead of raising.
    """
    return str(row['uniref_func']).split(" n=")[0]


def antigen_type(row):
    """Classify a row as ``"negative control"``, ``"flagella"`` or ``""``.

    A row is a negative control when ``"random"`` occurs in its
    ``"full name"``; it is a flagella antigen when ``"flagell"`` occurs in
    either ``"parsed_func"`` or ``"full name"``.  Everything else gets the
    empty string.  The match is case-sensitive.

    Both fields are converted with ``str`` before searching: joining the two
    input tables leaves NaN (a float) for antigens present in only one file,
    and ``"random" in <float>`` would raise ``TypeError``.
    """
    full_name = str(row["full name"])
    parsed_func = str(row["parsed_func"])

    if "random" in full_name:
        return "negative control"
    if "flagell" in parsed_func or "flagell" in full_name:
        return "flagella"
    return ""


def _load_antigens(reac_path, func_path):
    """Load both CSV files and return the joined, annotated dataframe."""
    # step 1 - load the data, join the dataframes on their shared
    # "antigen_num" index and fill the missing reactivity values with 0
    df_reac = pd.read_csv(reac_path, index_col="antigen_num")
    df_func = pd.read_csv(func_path, index_col="antigen_num")
    df_rf = pd.concat([df_reac, df_func], axis=1)
    df_rf['perc_passed'] = df_rf['perc_passed'].fillna(0)

    # step 2 - parse the uniref_func column into a new "parsed_func" column
    df_rf['parsed_func'] = df_rf.apply(parse_column, axis=1)

    # step 3 - label the two groups: flagella antigens and negative controls
    df_rf['type_of_antigen'] = df_rf.apply(antigen_type, axis=1)
    return df_rf


def _save_outputs(df_toplot, path_of_dir):
    """Write the box plot, the group statistics and the filtered table."""
    # Imported here so that the helper functions above can be imported and
    # tested on a machine that has no plotting stack installed.
    import matplotlib.pyplot as plt

    df_toplot.boxplot(by="type_of_antigen", column=["perc_passed"], grid=False)
    plt.ylabel("% of reactivity")
    plt.xlabel("type of antigen")
    plt.title("reactivity against bacterial flagella antigens")
    # pandas adds its own "grouped by" super-title when ``by`` is used
    plt.suptitle("")
    plt.savefig(os.path.join(path_of_dir, FIG_FILE_NAME))

    stat = df_toplot.groupby('type_of_antigen').describe().unstack(1)
    stat.to_csv(os.path.join(path_of_dir, STAT_FILE_NAME))
    df_toplot.to_csv(os.path.join(path_of_dir, DF_FILE_NAME))


def main(argv=None):
    """Run the whole analysis.

    ``argv`` defaults to ``sys.argv``; ``argv[1]`` must be the directory that
    holds the two input files.  Exits with the usage message when the
    directory argument is missing or when either input file does not exist.
    """
    start_time = time.time()
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        sys.exit(USAGE)

    path_of_dir = argv[1]
    reac_path = os.path.join(path_of_dir, REACTIVITY_FILE)
    func_path = os.path.join(path_of_dir, FUNCTION_FILE)

    # Both files are required.  (Testing ``exists(a or b)`` would only ever
    # look at the first path, because ``a or b`` evaluates to ``a``.)
    if not (os.path.exists(reac_path) and os.path.exists(func_path)):
        sys.exit(USAGE)

    df_rf = _load_antigens(reac_path, func_path)

    # step 4 - keep only the two groups of interest and write the outputs
    df_toplot = df_rf.loc[df_rf["type_of_antigen"] != ""]
    _save_outputs(df_toplot, path_of_dir)

    execution_time = time.time() - start_time
    print("Project is done,execution time in seconds: " + str(execution_time))


if __name__ == "__main__":
    main()
Published on 2020-11-26
If you have any comments or questions, feel free to post them on the source of this page in GitHub. Source on GitHub.
Comment on this post