R

library(tidyverse)
library(knitr)

iris %>% 
  select(Species, Sepal.Width, Sepal.Length) %>% 
  filter(Sepal.Width >= 3, Species %in% c("setosa", "virginica")) %>% 
  mutate(
    Sepal_Ratio = round(Sepal.Width/Sepal.Length, 4), 
    small_ratio = ifelse(Sepal_Ratio < 0.7, 1, 0)) %>%
  arrange(Species, desc(Sepal.Length)) %>% 
  head()

  Species Sepal.Width Sepal.Length Sepal_Ratio small_ratio
1  setosa         4.0          5.8      0.6897           1
2  setosa         4.4          5.7      0.7719           0
3  setosa         3.8          5.7      0.6667           1
4  setosa         4.2          5.5      0.7636           0
5  setosa         3.5          5.5      0.6364           1
6  setosa         3.9          5.4      0.7222           0

Python

import pandas as pd
import numpy as np

(iris
  .filter(["Species", "Sepal.Width", "Sepal.Length"])
  .query('(`Sepal.Width` >= 3) & (Species in ["setosa", "virginica"])')
  .assign(
    Sepal_Ratio = lambda x: round( x['Sepal.Width'] / x['Sepal.Length'], 4), 
    small_ratio = lambda x: np.where(x['Sepal_Ratio'], 1, 0))
  .sort_values(["Species", "Sepal.Length"], ascending = [True, False])
  .head() )

   Species  Sepal.Width  Sepal.Length  Sepal_Ratio  small_ratio
14  setosa          4.0           5.8       0.6897            1
15  setosa          4.4           5.7       0.7719            1
18  setosa          3.8           5.7       0.6667            1
33  setosa          4.2           5.5       0.7636            1
36  setosa          3.5           5.5       0.6364            1

In this article you will see the same analysis preformed in R & Python. Note the similarities and differences.

R

library(tidyverse)


dat <- read_csv("C:/Users/Bethany/Downloads/grades.csv")

is_math <- c("FDMAT108", "MATH101")
is_stats <- c("MATH221A", "MATH325")
is_major <- c("CIT111", "CIT225")
is_general_ed <- c("FDAMF101", "FDENG101")

dat1 <- dat %>% 
  select(!c(`Course Program`, `Catalog`, `Grade Type`)) %>% 
  filter(!str_starts(Grade, "N/A")) %>% 
  mutate(
    Grade = fct_relevel(Grade, c("A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "F", "P", "W")), 
    Semester = fct_relevel(Semester, "WI19", "SP19", "FA19", "WI20", "SP20", "FA20", "WI21", "SP21", "FA21", "WI22", "SP22", "FA22"), 
    `Grade Points` = case_when(
      Grade == "A" ~ 4, 
      Grade == "A-" ~ 3.7, 
      Grade == "B+" ~ 3.4, 
      Grade == "B" ~ 3,
      Grade == "B-" ~ 2.7, 
      Grade == "C+" ~ 2.4,
      Grade == "C" ~ 2, 
      Grade == "C-" ~ 1.7,
      Grade == "D+" ~ 0.4, 
      Grade == "D" ~ 1,
      Grade == "D-" ~ 0.7, 
      Grade == "F" ~ 0,
      Grade == "UW" ~ 0, 
      Grade == "P" ~ NA_real_,
      Grade == "W" ~ NA_real_, 
      Grade == "I" ~ NA_real_,
      Grade == "T" ~ NA_real_, 
      Grade == "NR" ~ NA_real_), 
    Course = str_replace(Course, "-\\d\\d", ""), 
    Course = str_replace(Course, "\\W", ""), 
    is_math = case_when(Course %in% is_math ~ 1, TRUE ~ 0),
    is_stats = case_when(Course %in% is_stats ~ 1, TRUE ~ 0), 
    is_major = case_when(Course %in% is_major ~ 1, TRUE ~ 0), 
    is_general_ed = case_when(Course %in% is_general_ed ~ 1, TRUE ~ 0)
)

head(dat1)

# A tibble: 6 × 10
  Course   `Course Title` Credits Grade Semester `Grade Points` is_math is_stats
  <chr>    <chr>            <dbl> <fct> <fct>             <dbl>   <dbl>    <dbl>
1 DANCE18… Social Dance,…       1 P     WI19                 NA       0        0
2 FDAMF101 American Foun…       3 A     WI19                  4       0        0
3 FDENG101 Writing & Rea…       3 A     WI19                  4       0        0
4 FDMAT108 Math for the …       3 A     WI19                  4       1        0
5 FDREL275 Teachings of …       2 A     WI19                  4       0        0
6 GS106A   College Succe…       1 A     WI19                  4       0        0
# ℹ 2 more variables: is_major <dbl>, is_general_ed <dbl>

Python

import pandas as pd
import numpy as np
from plotnine import *
dat = pd.read_csv("C:/Users/Bethany/Downloads/grades.csv")

is_math = ("FDMAT108", "MATH101")
is_stats = ("MATH221A", "MATH325")
is_major = ("CIT111", "CIT225")
is_general_ed = ("FDAMF101", "FDENG101")

dat1 = (dat
    .drop(['Course Program', 'Catalog', 'Grade Type'], axis = 1)
    .query("~Grade.str.startswith('N/A')")
    .assign(
        Grade = lambda X: pd.Categorical(X.Grade, categories = ["A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "F", "P", "W"]), 
        Semester = lambda X: pd.Categorical(X.Semester, ["WI19", "SP19", "FA19", "WI20", "SP20", "FA20", "WI21", "SP21", "FA21", "WI22", "SP22", "FA22"]),
        Grade_Points = lambda X: X['Grade'].replace({
            "A": 4, 
            "A-": 3.7,
            "B+": 3.4, 
            "B": 3,
            "B-": 2.7, 
            "C+": 2.4,
            "C": 2, 
            "C-": 1.7,
            "D+": 0.4, 
            "D": 1,
            "D-": 0.7, 
            "F": 0,
            "UW": 0, 
            "P": np.nan,
            "W": np.nan, 
            "I": np.nan,
            "T": np.nan, 
            "NR": np.nan}),
        Course = lambda X: X['Course'].replace({r"-\d\d": "", r"\W":""}, regex=True),
        is_math = lambda X: np.where(X['Course'].isin(is_math), True, False), 
        is_stats = lambda X: np.where(X['Course'].isin(is_stats), True, False), 
        is_major = lambda X: np.where(X['Course'].isin(is_major), 1, 0), 
        is_general_ed = lambda X: np.where(X['Course'].isin(is_general_ed), 1, 0)
        )
)

dat1.head()

      Course                     Course Title  ...  is_major is_general_ed
0  DANCE180M          Social Dance, Beginning  ...         0             0
1   FDAMF101             American Foundations  ...         0             1
2   FDENG101  Writing & Reasoning Foundations  ...         0             1
3   FDMAT108          Math for the Real World  ...         0             0
4   FDREL275      Teachings of Book of Mormon  ...         0             0

[5 rows x 10 columns]

Reading in Data

R (Dplyr)

library(tidyverse)
dat1 <- read_csv()

Python (Pandas)

import pandas as pd
dat1 = pd.read_csv()

Wrangling Verbs

What we are familiar with in R (Tidyverse):

select()

filter()

group_by()

summarise()

arrange()

Here are the Python (Pandas) translations:

.filter()

.query()

.groupby()

.agg()

.sort_values()

Visualization

BaseR
ggplot2

Package Options Include:

Altair (Very good plotting package, based on the grammer of graphics)
Plotnine (This is how to use ggplot2 in python)
Seaborn (Good plotting package, does not follow the grammer of graphics)
Plotly (Good plotting package, does not follow the grammer of graphics)
Matplotlib/PyPlot (Good, basic plotting package)

Click to download

Question 1

Grab the age, sex, and charges columns
Filter the data to only females
Sort the data by age decending

Code Solution

import pandas as pd

dat = pd.read_csv("insurance.csv")

(dat
  .filter(['age', 'sex', 'charges'], axis = 1)
  .query('sex == "female"')
  .sort_values('age')
)

     age     sex      charges
46    18  female   3393.35635
728   18  female   2217.60120
803   18  female  38792.68560
942   18  female   2217.46915
50    18  female   2211.13075
..   ...     ...          ...
768   64  female  14319.03100
94    64  female  47291.05500
664   64  female  27037.91410
801   64  female  14313.84630
890   64  female  29330.98315

[662 rows x 3 columns]

Question 2

Grab the region and charges columns
Filter the data to charges greater than 2000
Group by region and find the total charges by region

Code Solution

import pandas as pd

dat = pd.read_csv("insurance.csv")

(dat
  .filter(["charges", 'region'], axis = 1)
  .query('charges > 1000')
  .groupby(['region'])
  .agg(
    Sum_Charges = ('charges', 'sum'),
    Total_Charges = ('charges', 'count'))
)

            Sum_Charges  Total_Charges
region                                
northeast  4.343669e+06            324
northwest  4.035712e+06            325
southeast  5.363690e+06            364
southwest  4.012755e+06            325