R
iris %>%
select(Species, Sepal.Width, Sepal.Length) %>%
filter(Sepal.Width >= 3, Species %in% c("setosa", "virginica")) %>%
mutate(
Sepal_Ratio = round(Sepal.Width/Sepal.Length, 4),
small_ratio = ifelse(Sepal_Ratio < 0.7, 1, 0)) %>%
arrange(Species, desc(Sepal.Length)) %>%
head()
Species Sepal.Width Sepal.Length Sepal_Ratio small_ratio
1 setosa 4.0 5.8 0.6897 1
2 setosa 4.4 5.7 0.7719 0
3 setosa 3.8 5.7 0.6667 1
4 setosa 4.2 5.5 0.7636 0
5 setosa 3.5 5.5 0.6364 1
6 setosa 3.9 5.4 0.7222 0
Python
(iris
.filter(["Species", "Sepal.Width", "Sepal.Length"])
.query('(`Sepal.Width` >= 3) & (Species in ["setosa", "virginica"])')
.assign(
Sepal_Ratio = lambda x: round( x['Sepal.Width'] / x['Sepal.Length'], 4),
small_ratio = lambda x: np.where(x['Sepal_Ratio'], 1, 0))
.sort_values(["Species", "Sepal.Length"], ascending = [True, False])
.head() )
Species Sepal.Width Sepal.Length Sepal_Ratio small_ratio
14 setosa 4.0 5.8 0.6897 1
15 setosa 4.4 5.7 0.7719 1
18 setosa 3.8 5.7 0.6667 1
33 setosa 4.2 5.5 0.7636 1
36 setosa 3.5 5.5 0.6364 1
In this article you will see the same analysis preformed in R & Python. Note the similarities and differences.
R
dat1 <- dat %>%
select(!c(`Course Program`, `Catalog`, `Grade Type`)) %>%
filter(!str_starts(Grade, "N/A")) %>%
mutate(
Grade = fct_relevel(Grade, c("A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "F", "P", "W")),
Semester = fct_relevel(Semester, "WI19", "SP19", "FA19", "WI20", "SP20", "FA20", "WI21", "SP21", "FA21", "WI22", "SP22", "FA22"),
`Grade Points` = case_when(
Grade == "A" ~ 4,
Grade == "A-" ~ 3.7,
Grade == "B+" ~ 3.4,
Grade == "B" ~ 3,
Grade == "B-" ~ 2.7,
Grade == "C+" ~ 2.4,
Grade == "C" ~ 2,
Grade == "C-" ~ 1.7,
Grade == "D+" ~ 0.4,
Grade == "D" ~ 1,
Grade == "D-" ~ 0.7,
Grade == "F" ~ 0,
Grade == "UW" ~ 0,
Grade == "P" ~ NA_real_,
Grade == "W" ~ NA_real_,
Grade == "I" ~ NA_real_,
Grade == "T" ~ NA_real_,
Grade == "NR" ~ NA_real_),
Course = str_replace(Course, "-\\d\\d", ""),
Course = str_replace(Course, "\\W", ""),
is_math = case_when(Course %in% is_math ~ 1, TRUE ~ 0),
is_stats = case_when(Course %in% is_stats ~ 1, TRUE ~ 0),
is_major = case_when(Course %in% is_major ~ 1, TRUE ~ 0),
is_general_ed = case_when(Course %in% is_general_ed ~ 1, TRUE ~ 0)
)
# A tibble: 6 × 10
Course `Course Title` Credits Grade Semester `Grade Points` is_math is_stats
<chr> <chr> <dbl> <fct> <fct> <dbl> <dbl> <dbl>
1 DANCE18… Social Dance,… 1 P WI19 NA 0 0
2 FDAMF101 American Foun… 3 A WI19 4 0 0
3 FDENG101 Writing & Rea… 3 A WI19 4 0 0
4 FDMAT108 Math for the … 3 A WI19 4 1 0
5 FDREL275 Teachings of … 2 A WI19 4 0 0
6 GS106A College Succe… 1 A WI19 4 0 0
# ℹ 2 more variables: is_major <dbl>, is_general_ed <dbl>
Python
dat1 = (dat
.drop(['Course Program', 'Catalog', 'Grade Type'], axis = 1)
.query("~Grade.str.startswith('N/A')")
.assign(
Grade = lambda X: pd.Categorical(X.Grade, categories = ["A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "F", "P", "W"]),
Semester = lambda X: pd.Categorical(X.Semester, ["WI19", "SP19", "FA19", "WI20", "SP20", "FA20", "WI21", "SP21", "FA21", "WI22", "SP22", "FA22"]),
Grade_Points = lambda X: X['Grade'].replace({
"A": 4,
"A-": 3.7,
"B+": 3.4,
"B": 3,
"B-": 2.7,
"C+": 2.4,
"C": 2,
"C-": 1.7,
"D+": 0.4,
"D": 1,
"D-": 0.7,
"F": 0,
"UW": 0,
"P": np.nan,
"W": np.nan,
"I": np.nan,
"T": np.nan,
"NR": np.nan}),
Course = lambda X: X['Course'].replace({r"-\d\d": "", r"\W":""}, regex=True),
is_math = lambda X: np.where(X['Course'].isin(is_math), True, False),
is_stats = lambda X: np.where(X['Course'].isin(is_stats), True, False),
is_major = lambda X: np.where(X['Course'].isin(is_major), 1, 0),
is_general_ed = lambda X: np.where(X['Course'].isin(is_general_ed), 1, 0)
)
)
Course Course Title ... is_major is_general_ed
0 DANCE180M Social Dance, Beginning ... 0 0
1 FDAMF101 American Foundations ... 0 1
2 FDENG101 Writing & Reasoning Foundations ... 0 1
3 FDMAT108 Math for the Real World ... 0 0
4 FDREL275 Teachings of Book of Mormon ... 0 0
[5 rows x 10 columns]
Reading in Data
Wrangling Verbs
What we are familiar with in R (Tidyverse):
select()
filter()
group_by()
summarise()
arrange()
Here are the Python (Pandas) translations:
filter() .
.query()
.groupby()
.agg()
.sort_values()
Visualization
- BaseR
- ggplot2
Package Options Include:
- Altair (Very good plotting package, based on the grammer of graphics)
- Plotnine (This is how to use ggplot2 in python)
- Seaborn (Good plotting package, does not follow the grammer of graphics)
- Plotly (Good plotting package, does not follow the grammer of graphics)
- Matplotlib/PyPlot (Good, basic plotting package)
Question 1
Grab the
age
,sex
, andcharges
columnsFilter the data to only females
Sort the data by age decending
Code Solution
age sex charges
46 18 female 3393.35635
728 18 female 2217.60120
803 18 female 38792.68560
942 18 female 2217.46915
50 18 female 2211.13075
.. ... ... ...
768 64 female 14319.03100
94 64 female 47291.05500
664 64 female 27037.91410
801 64 female 14313.84630
890 64 female 29330.98315
[662 rows x 3 columns]
Question 2
Grab the
region
andcharges
columnsFilter the data to charges greater than 2000
Group by region and find the total charges by region
Code Solution
Sum_Charges Total_Charges
region
northeast 4.343669e+06 324
northwest 4.035712e+06 325
southeast 5.363690e+06 364
southwest 4.012755e+06 325