Pandas / Statistics
Find values
Pandas has multiple built-in methods for descriptive statistics.
""" Statistics / Max, Min, Average
Can be applied to a column or to whole dataframe.
"""
import pandas as pd
import pathlib
# Directory containing this script; the CSV path below is built relative to it.
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / '../_data/titanic.csv')
# Statistics (by Age): a single-row frame with one column per aggregate.
A = pd.DataFrame({
    'max': [df['Age'].max()],
    'min': [df['Age'].min()],
    'avg': [df['Age'].mean()],
})
print(A)
# Value counts (by PClass): occurrences of each distinct value.
A = df['PClass'].value_counts().to_frame('PClass')
print(A)
# Unique values (by Sex)
# Counts are looked up by label, not by position: unique() lists values in
# order of first appearance while value_counts() sorts by frequency, so
# pairing the two positionally attaches each count to the wrong value.
sex_counts = df['Sex'].value_counts()
A = pd.DataFrame({'unique_value': df['Sex'].unique()})
A['total'] = A['unique_value'].map(sex_counts)
print(A)
# Missing values (by Age)
A = df[df['Age'].isnull()]
# len() counts rows; DataFrame.size would report rows * columns instead.
print("Missing values (Age):", len(A))
print(A.head())
"""
max min avg
0 71.0 0.17 30.397989
PClass
3rd 711
1st 322
2nd 279
* 1
unique_value total
0 female 462
1 male 851
Missing values (Age): 557
Name PClass Age Sex Survived SexCode
12 Aubert, Mrs Leontine Pauline 1st NaN female 1 1
13 Barkworth, Mr Algernon H 1st NaN male 1 0
14 Baumann, Mr John D 1st NaN male 0 0
29 Borebank, Mr John James 1st NaN male 0 0
32 Bradley, Mr George 1st NaN male 1 0
"""
Group By
GroupBy is one of the most powerful features in pandas.
""" Statistics / Group By
"""
import pandas as pd
import pathlib
# Directory containing this script; the CSV path below is built relative to it.
DIR = pathlib.Path(__file__).resolve().parent
df = pd.read_csv(DIR / '../_data/titanic.csv')
# Group the rows by 'Sex' once; the grouped object is reused below.
by_sex = df.groupby('Sex')

# Non-null entries per column within each group.
A = by_sex.count()
print(A)

# Mean of every numeric column within each group.
A = by_sex.mean(numeric_only=True)
print(A)

# Non-null 'Survived' entries within each group.
A = by_sex['Survived'].count()
print(A)

# Numeric-column means for every (Sex, Survived) combination.
A = df.groupby(['Sex', 'Survived']).mean(numeric_only=True)
print(A)
"""
Name PClass Age Survived SexCode
Sex
female 462 462 288 462 462
male 851 851 468 851 851
Age Survived SexCode
Sex
female 29.396424 0.666667 1.0
male 31.014338 0.166863 0.0
Sex
female 462
male 851
Name: Survived, dtype: int64
Age SexCode
Sex Survived
female 0 24.901408 1.0
1 30.867143 1.0
male 0 32.320780 0.0
1 25.951875 0.0
"""
Last update: 53 days ago