TODO: Stack Overflow example

examples/pandas/stack_overflow_pandas.py

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Path to the survey CSV, relative to the current working directory.
file = './developer_survey_2020/survey_results_public.csv'

df = pd.read_csv(file, sep=',')

# Count respondents per country in a single pass over the column.
# value_counts() leaves out missing answers, so no placeholder 'nan'
# category with zero respondents ends up on the axis, and sort_index()
# orders the bars by country name.
country_counts = df['Country'].value_counts().sort_index()
countries = country_counts.index.tolist()
totals = country_counts.tolist()

# One bar per country, each tick labelled with the country name.
bar_positions = np.arange(len(countries))
plt.figure()
plt.bar(bar_positions, totals)
plt.xticks(bar_positions, countries)



#%%

# Count how many respondents gave each 'Age1stCode' answer.
# Missing answers are dropped up front so they do not appear as a
# category of their own.
first_code_age = df['Age1stCode'].dropna()
if first_code_age.dtype == object:
    # An object column may mix numbers and text answers; casting to str
    # gives every answer one comparable type so equal answers are counted
    # together and the index can be sorted.
    first_code_age = first_code_age.astype(str)

age_counts = first_code_age.value_counts().sort_index()
ages = age_counts.index.tolist()
age_totals = age_counts.tolist()

# Scatter of answer vs. number of respondents; small tick font because
# there are many distinct answers on the x axis.
plt.figure()
plt.scatter(ages, age_totals)
plt.xticks(fontsize=5)

examples/pandas/pandas_stackoverflow.py

import pandas

df = pandas.read_csv("survey_results_public.csv")

# Responses per country, most frequent first.
countrey_dist = df['Country'].value_counts()

# Distribution of the 'OpenSourcer' answers over all respondents.
open_sourcers_dist = df['OpenSourcer'].value_counts()

print("top 10 response countries:\n", countrey_dist.head(10))

# Build the United States row mask once and reuse it for every selection.
is_us = df['Country'] == 'United States'
open_sourcers_dist_top = df.loc[is_us, 'OpenSourcer'].value_counts()
experience_dist_top = df.loc[is_us, 'YearsCode'].value_counts()

print('distribution of open sourcer is the top country USA is :\n', open_sourcers_dist_top)
print('distribution of  experience in the top country USA is:\n', experience_dist_top)

# Keep only the two columns of interest for United States respondents.
df = df.loc[is_us, ['OpenSourcer', 'YearsCode']]

# Summarise coding experience per 'OpenSourcer' answer. 'YearsCode' may
# contain non-numeric answers; errors='coerce' turns those into NaN, which
# the count/mean/median below ignore.
years_code = pandas.to_numeric(df['YearsCode'], errors='coerce')
df_agg = years_code.groupby(df['OpenSourcer']).agg(['count', 'mean', 'median'])

print("relationship betwen OpenSourcer to coding exprience in US is :\n", df_agg)

examples/pandas/orig.py

import sys
import pandas as pd

# CSV to analyse; a single command-line argument overrides the default name.
filename = "survey_results_public.csv"
if len(sys.argv) == 2:
    filename = sys.argv[1]
country_name = 'Israel'

# Read the file in 10,000-row chunks and keep only the rows of the selected
# country from each chunk.
chunks = []
for chunk in pd.read_csv(filename, usecols=['Country', 'DevType'], chunksize=10000):
    part = chunk[chunk['Country'] == country_name]

    # Progress output: number of cells in the chunk vs. in the kept rows.
    print(chunk.size)
    print(part.size)
    print('--')
    chunks.append(part)

df = pd.concat(chunks)
print(df.dtypes)

# Rows without a 'DevType' answer are treated as empty text, so they yield
# False for every role instead of raising on a substring test against NaN.
dev_type_text = df['DevType'].fillna('').astype(str)

# Add one boolean column per role, True where the role name occurs in the
# respondent's 'DevType' answer.
for value in [
    'Academic researcher',
    'Data or business analyst',
    'Data scientist or machine learning specialist',
    'Database administrator',
    'Designer',
    'Developer, back-end',
    'Developer, desktop or enterprise applications',
    'Developer, embedded applications or devices',
    'Developer, front-end',
    'Developer, full-stack',
    'Developer, game or graphics',
    'Developer, mobile',
    'Developer, QA or test',
    'DevOps specialist',
    'Educator',
    'Engineer, data',
    'Engineer, site reliability',
    'Engineering manager',
    'Marketing or sales professional',
    'Product manager',
    'Scientist',
    'Senior Executive (C-Suite, VP, etc.)',
    'System administrator',
]:
    print(value)
    # regex=False: role names such as '(C-Suite, VP, etc.)' contain regex
    # metacharacters and must be matched literally.
    df[value] = dev_type_text.str.contains(value, regex=False)

print(df.count())
print(df.size)

examples/pandas/panda_file.py

import sys
import pandas as pd

# CSV to analyse; a single command-line argument overrides the default name.
filename = "survey_results_public.csv"
if len(sys.argv) == 2:
    filename = sys.argv[1]
country_name = 'Israel'

# Read the file in 10,000-row chunks and keep only the rows of the selected
# country from each chunk.
chunks = []
for chunk in pd.read_csv(filename, usecols=['Country', 'DevType'], chunksize=10000):
    part = chunk[chunk['Country'] == country_name]
    chunks.append(part)

df = pd.concat(chunks)
print(df.dtypes)

# Rows without a 'DevType' answer are treated as empty text, so they yield
# False for every role.
dev_type_text = df['DevType'].fillna('').astype(str)

# Add one boolean column per role, True where the role name occurs in the
# respondent's 'DevType' answer. The match is done column-wise instead of
# calling a Python function once per row.
for value in [
    'Academic researcher',
    'Data or business analyst',
    'Data scientist or machine learning specialist',
    'Database administrator',
    'Designer',
    'Developer, back-end',
    'Developer, desktop or enterprise applications',
    'Developer, embedded applications or devices',
    'Developer, front-end',
    'Developer, full-stack',
    'Developer, game or graphics',
    'Developer, mobile',
    'Developer, QA or test',
    'DevOps specialist',
    'Educator',
    'Engineer, data',
    'Engineer, site reliability',
    'Engineering manager',
    'Marketing or sales professional',
    'Product manager',
    'Scientist',
    'Senior Executive (C-Suite, VP, etc.)',
    'System administrator',
]:
    print(value)
    # regex=False: role names such as '(C-Suite, VP, etc.)' contain regex
    # metacharacters and must be matched literally.
    df[value] = dev_type_text.str.contains(value, regex=False)

print(df.count())
print(df.size)
print(df)

examples/pandas/another_pandas.py

import pandas as pd
import matplotlib.pyplot as plt

filepath = r'survey_results_public.csv'

df = pd.read_csv(filepath)
print("The dataframe columns are:\n", list(df.columns))
print('-' * 30)

# Report the dtype of every column; for float64 columns also print the
# summary statistics. Iterating over (label, dtype) pairs avoids positional
# [] lookups on the label-indexed dtypes Series.
for column, dtype in df.dtypes.items():
    print(column, 'is of type ', dtype)
    if dtype == 'float64':
        print('*' * 10, "\nAnd it's statistics:")
        print(df[column].describe())

# Country with the most responses: value_counts() orders by frequency,
# so the first index label is the most common one.
most_responsive_country = df['Country'].value_counts().index[0]

# Average weekly working hours among respondents from that country.
most_responsive_country_df = df[df['Country'] == most_responsive_country]
average_working_time_weekly = most_responsive_country_df['WorkWeekHrs'].mean()
print("Average weekly working hours in", most_responsive_country, "is:", average_working_time_weekly)

# Back to the full frame: share of each undergraduate major, in percent,
# drawn as a horizontal bar chart.
study_fields_normalized = df['UndergradMajor'].value_counts(normalize=True)
fig, ax = plt.subplots()
ax.barh(study_fields_normalized.index.tolist(), (study_fields_normalized * 100).tolist())
ax.set_xlabel("Relative Distribution")

# pyplot.show() runs the GUI event loop, so the window stays open when this
# is executed as a plain script.
plt.show()