In R, converting Wide dataframe to Long while retaining some information - r

I have a large data set that requires some converting but I am not sure what to do.
Let's say I have 2 participants in my study.
# Scores for the two participants, one vector per sport/measure pair.
football_enjoyment <- c(5, 3)
football_participation <- c(1, 2)
basketball_enjoyment <- c(5, 5)
basketball_participation <- c(1, 3)
# Wide layout: one row per participant, one column per sport/measure pair.
df <- data.frame(
  football_enjoyment,
  football_participation,
  basketball_enjoyment,
  basketball_participation
)
# Running participant id (1, 2, ...) appended as the last column.
df[["id"]] <- seq_len(nrow(df))
df
## football_enjoyment football_participation basketball_enjoyment basketball_participation id
# 5 1 5 1 1
# 3 2 5 3 2
I want it to be like this
sports <- c("football","football", "basketball","basketball")
enjoyment_score <- c(5,3,5,5)
participation_score <- c(1,2,1,3)
id <- c(1,2)
df2 <- data.frame(sports, enjoyment_score,participation_score, id)
df2
## sports enjoyment_score participation_score id
# football 5 1 1
# football 3 2 2
# basketball 5 1 1
# basketball 5 3 2
I am stuck on the structure; the column/row names are just for demonstration purposes.

With tidyverse you could do:
library(tidyverse)
library(reshape2)
# Step 1: stack every score column into (variable, value) pairs, keeping id.
stacked <- gather(df, "variable", "value", -id)
# Step 2: split names such as "football_enjoyment" at the underscore into the
# sport and the kind of score.
stacked <- separate(stacked, variable, into = c("sports", "variable"), sep = "_")
# Step 3: spread the score kinds back out into columns, one row per id/sport.
per_sport <- dcast(stacked, id + sports ~ variable)
# Step 4: sort by sport in descending order (football before basketball).
arrange(per_sport, desc(sports))
# id sports enjoyment participation
#1 1 football 5 1
#2 2 football 3 2
#3 1 basketball 5 1
#4 2 basketball 5 3
Or, in base you could do:
# Wide columns to stack, grouped sport by sport (enjoyment first, then
# participation, within each sport).
wide_cols <- c(
  "football_enjoyment", "football_participation",
  "basketball_enjoyment", "basketball_participation"
)
# Long layout: one row per id/sport, with the sport name stored in `sports`
# and the two measures in `enjoyment` and `participation`.
df2 <- reshape(
  df,
  direction = "long",
  idvar = "id",
  varying = wide_cols,
  v.names = c("enjoyment", "participation"),
  timevar = "sports",
  times = c("football", "basketball"),
  sep = "_"
)
# Drop the row names generated by reshape() in favour of plain 1..n.
row.names(df2) <- NULL
# id sports enjoyment participation
#1 1 football 5 1
#2 2 football 3 2
#3 1 basketball 5 1
#4 2 basketball 5 3

tidyr 1.0.0 has a pivot_longer function that can do this:
library(tidyr)
football_enjoyment <- c(5,3)
basketball_enjoyment <- c(5,5)
football_participation <- c(1,2)
basketball_participation <- c(1,3)
df<- data.frame(football_enjoyment,football_participation,
basketball_enjoyment,basketball_participation)
df$id <- seq.int(nrow(df))
df
#> football_enjoyment football_participation basketball_enjoyment
#> 1 5 1 5
#> 2 3 2 5
#> basketball_participation id
#> 1 1 1
#> 2 3 2
df %>% pivot_longer(-id, names_to = c("sports",".value"), names_sep = "_")
#> # A tibble: 4 x 4
#> id sports enjoyment participation
#> <int> <chr> <dbl> <dbl>
#> 1 1 football 5 1
#> 2 1 basketball 5 1
#> 3 2 football 3 2
#> 4 2 basketball 5 3
Created on 2019-09-20 by the reprex package (v0.3.0)

Related

Easy way to convert text columns to numeric

I have this example dataset
x <- c("hot", "cold", "warm", "hot", "hot")
y <- c("happy", "content", "happy", "sad", "annoyed")
df <- data.frame(x, y)
I want to find a quick way to convert the text to numbers; it doesn't matter which order the numbers are in.
So the output would be:
x y
1 1
2 2
3 1
1 3
1 4
Many Thanks
With Base R:
df[] <- lapply(df, function(x) as.numeric(as.factor(x)))
df
#> x y
#> 1 2 3
#> 2 1 2
#> 3 3 3
#> 4 2 4
#> 5 2 1
With purrr:
library(purrr)
df %>% map(as.factor) %>% map_dfc(as.numeric)
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 2 3
#> 2 1 2
#> 3 3 3
#> 4 2 4
#> 5 2 1
Keep track of the labels with labelled:
df <- df %>% map(as.factor) %>% map_dfc(labelled::to_labelled)
df
#> # A tibble: 5 x 2
#> x y
#> <dbl+lbl> <dbl+lbl>
#> 1 2 [hot] 3 [happy]
#> 2 1 [cold] 2 [content]
#> 3 3 [warm] 3 [happy]
#> 4 2 [hot] 4 [sad]
#> 5 2 [hot] 1 [annoyed]
df$x
#> <labelled<double>[5]>
#> [1] 2 1 3 2 2
#>
#> Labels:
#> value label
#> 1 cold
#> 2 hot
#> 3 warm
Or keep the numbers next to the original values in a new column:
df[paste0(names(df), "_num")] <- lapply(df, function(x) as.numeric(as.factor(x)))
df
#> x y x_num y_num
#> 1 hot happy 2 3
#> 2 cold content 1 2
#> 3 warm happy 3 3
#> 4 hot sad 2 4
#> 5 hot annoyed 2 1
If you want to change only the character columns to numeric:
library(purrr)
df %>% map_if(is.character, as.factor) %>% map_dfc(as.numeric)
df %>% map_if(is.character, as.factor) %>% map_dfc(labelled::to_labelled)
Or choose them by name:
library(purrr)
cols <- c("x", "y")
df %>% map_at(cols, as.factor) %>% map_dfc(as.numeric)
df %>% map_at(cols, as.factor) %>% map_dfc(labelled::to_labelled)
df[paste0(cols, "_num")] <- lapply(df[cols], function(x) as.numeric(as.factor(x)))
You could use rapply:
# Recode every column as integers numbered by order of first appearance.
# as.is = FALSE makes type.convert() turn character columns into factors.
# Since R 4.0.0, leaving as.is unspecified warns and keeps them as character,
# in which case the "factor"-only function below would never be applied and
# the data would come back unchanged.
rapply(
  type.convert(df, as.is = FALSE),
  function(x) as.integer(factor(x, unique(x))),
  classes = "factor",
  how = "replace"
)
x y
1 1 1
2 2 2
3 3 1
4 1 3
5 1 4
Maybe try this with dplyr:
library(dplyr)
#Code
newdf <- df %>% mutate(across(everything(),~as.numeric(as.factor(.))))
Output:
x y
1 2 3
2 1 2
3 3 3
4 2 4
5 2 1
In order to see the values, you can try this:
#Code 2
newdf2 <- df %>% mutate(across(everything(),~as.factor(.))) %>%
mutate(across(everything(),.fns = list(value = ~ as.numeric(.))))
Output:
x y x_value y_value
1 hot happy 2 3
2 cold content 1 2
3 warm happy 3 3
4 hot sad 2 4
5 hot annoyed 2 1
If we add a numeric variable, this should work:
#Code 3
newdf <- df %>% mutate(across(x:y,~as.factor(.))) %>%
mutate(across(x:y,.fns = list(value = ~ as.numeric(.))))
Output:
x y number x_value y_value
1 hot happy 10 2 3
2 cold content 20 1 2
3 warm happy 30 3 3
4 hot sad 40 2 4
5 hot annoyed 50 2 1
We can use match
df[] <- lapply(df, function(x) match(x, unique(x)))

Gather several columns at once in r

I am trying to gather() a data.frame, but somehow it is not doing what I want.
This is my data:
df <- data.frame("id" = c(1),
"reco_1"= c(2),
"sim_1" = c(2),
"title_1"= c(2),
"reco_2" = c(3),
"sim_2" = c(3),
"title_2"= c(3))
And this is what it looks like printed:
> df
id reco_1 sim_1 title_1 reco_2 sim_2 title_2
1 1 2 2 2 3 3 3
When I now gather() my df, it looks like this:
> df %>% gather(reco, sim, -id)
id reco sim
1 1 reco_1 2
2 1 sim_1 2
3 1 title_1 2
4 1 reco_2 3
5 1 sim_2 3
6 1 title_2 3
However, what I would like to have is the following structure:
id reco sim title
1 1 2 2 2
2 2 3 3 3
I would appreciate any help, since I do not know whether gather() is even the right verb for it.
We can use pivot_longer
library(dplyr)
library(tidyr)
df %>%
pivot_longer(-id, names_to = c(".value", "new_id"), names_sep = "_") %>%
select(-id)
# A tibble: 2 x 4
new_id reco sim title
<chr> <dbl> <dbl> <dbl>
1 1 2 2 2
2 2 3 3 3

Add original values for columns after group by

For the dataframe below I want to add the original values for Var_x after a group_by on ID and event and a max() on quest, but I cannot get my code right. Any suggestions? By the way, in my original dataframe more than 1 column needs to be added.
df <- data.frame(ID = c(1,1,1,1,1,1,2,2,2,3,3,3),
quest = c(1,1,2,2,3,3,1,2,3,1,2,3),
event = c("A","B","A","B","A",NA,"C","D","C","D","D",NA),
VAR_X = c(2,4,3,6,3,NA,6,4,5,7,5,NA))
Code:
df %>%
group_by(ID,event) %>%
summarise(quest = max(quest))
Desired output:
ID quest event VAR_X
1 1 2 B 6
2 1 3 A 3
3 2 2 D 4
4 2 3 C 5
5 3 2 D 5
Start by omitting the NA values, and at the end do an inner_join with the original data set.
df %>%
na.omit() %>%
group_by(ID, event) %>%
summarise(quest = max(quest)) %>%
inner_join(df, by = c("ID", "event", "quest"))
## A tibble: 5 x 4
## Groups: ID [3]
# ID event quest VAR_X
# <dbl> <fct> <dbl> <dbl>
#1 1 A 3 3
#2 1 B 2 6
#3 2 C 3 5
#4 2 D 2 4
#5 3 D 2 5
df %>%
drop_na() %>% # remove if necessary ..
group_by(ID, event) %>%
filter(quest == max(quest)) %>%
ungroup()
# A tibble: 5 x 4
# ID quest event VAR_X
#<dbl> <dbl> <chr> <dbl>
# 1 1 2 B 6
# 2 1 3 A 3
# 3 2 2 D 4
# 4 2 3 C 5
# 5 3 2 D 5

Using dplyr, count non-numeric grades in each class

Given the input and code below, using dplyr and groups, how can I produce the results shown in the output? I know how to sum columns in groups using dplyr, but in this case I need to count how many of each non-numeric grade occurred in each class.
**INPUT**
Class Student Grade
1 Jack C
1 Mary B
1 Mo B
1 Jane A
1 Tom C
2 Don C
2 Betsy B
2 Sue C
2 Tayna B
2 Kim C
**CODE**
# Create the dataframe
Class <- c(1,1,1,1,1,2,2,2,2,2)
Name <- c("Jack", "Mary", "Mo", "Jane", "Tom", "Don", "Betsy", "Sue", "Tayna", "Kim")
Grade <- c("C","B","B","A","C","C","B","C","B","C")
StudentGrades <- data.frame(Class, Name, Grade)
**OUTPUT**
Class Grade-A Grade-B Grade-C
1 1 2 2
2 0 2 3
We can use count to get the frequency count and then with pivot_wider change from 'long' to 'wide' format
library(dplyr)
library(tidyr)
library(stringr)
StudentGrades %>%
count(Class, Grade = str_c('Grade_', Grade)) %>%
pivot_wider(names_from = Grade, values_from = n, values_fill = list(n = 0))
# A tibble: 2 x 4
# Class Grade_A Grade_B Grade_C
# <dbl> <int> <int> <int>
#1 1 1 2 2
#2 2 0 2 3
Or in base R
table(StudentGrades[c('Class', 'Grade')])
Here is a base R solution, where table() + split() are used
# Fix the set of grades up front so every class reports the same columns,
# including grades nobody in that class received (count 0). Without this,
# table() on a character column only lists the grades present in each class,
# the per-class rows end up with different lengths, and rbind() misaligns
# the counts. as.factor() keeps existing levels when Grade is already a
# factor, so the older factor-based behaviour is unchanged.
grade_levels <- levels(as.factor(StudentGrades$Grade))
# One row per class: the class value followed by the count of each grade.
dfout <- do.call(rbind, lapply(split(StudentGrades, StudentGrades$Class),
  function(v) c(unique(v[1]), table(factor(v$Grade, levels = grade_levels)))))
such that
> dfout
Class A B C
1 1 1 2 2
2 2 0 2 3

Add together 2 dataframes in R without losing columns

I have 2 dataframes in R (df1, df2).
A C D
1 1 1
2 2 2
df2 as
A B C
1 1 1
2 2 2
How can I merge these 2 dataframes to produce the following output?
A B C D
2 1 2 1
4 2 4 2
Columns are sorted and column values are added. Both DFs have same number of rows. Thank you in advance.
Code to create DF:
df1 <- data.frame("A" = 1:2, "C" = 1:2, "D" = 1:2)
df2 <- data.frame("A" = 1:2, "B" = 1:2, "C" = 1:2)
# Column names of each input, and the names they have in common.
nm1 <- names(df1)
nm2 <- names(df2)
nm <- intersect(nm1, nm2)
if (length(nm) > 0) {
  # Some columns overlap: keep the df1-only columns, add the shared columns
  # together, then append the df2-only columns.
  only_in_df1 <- df1[!(nm1 %in% nm2)]
  shared_sum <- df1[nm] + df2[nm]
  only_in_df2 <- df2[!(nm2 %in% nm1)]
  cbind(only_in_df1, shared_sum, only_in_df2)
} else {
  # Nothing overlaps, so the two frames are simply placed side by side.
  cbind(df1, df2)
}
# D A C B
#1 1 2 2 1
#2 2 4 4 2
You can try:
library(tidyverse)
list(df2, df1) %>%
map(rownames_to_column) %>%
bind_rows %>%
group_by(rowname) %>%
summarise_all(sum, na.rm = TRUE)
# A tibble: 2 x 5
rowname A B C D
<chr> <int> <int> <int> <int>
1 1 2 1 2 1
2 2 4 2 4 2
By using left_join() from dplyr you won't lose the column
library(tidyverse)
dat1 <- tibble(a = 1:10,
b = 1:10,
c = 1:10)
dat2 <- tibble(c = 1:10,
d = 1:10,
e = 1:10)
left_join(dat1, dat2, by = "c")
#> # A tibble: 10 x 5
#> a b c d e
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 2 2 2 2
#> 3 3 3 3 3 3
#> 4 4 4 4 4 4
#> 5 5 5 5 5 5
#> 6 6 6 6 6 6
#> 7 7 7 7 7 7
#> 8 8 8 8 8 8
#> 9 9 9 9 9 9
#> 10 10 10 10 10 10
Created on 2019-01-16 by the reprex package (v0.2.1)
# Build a zero-filled frame holding every column name found in either input
# (sorted alphabetically), then add each input into it.
allnames <- sort(unique(c(names(df1), names(df2))))
df3 <- data.frame(matrix(0, nrow = nrow(df1), ncol = length(allnames)))
names(df3) <- allnames
# Index by column name so each input column is added to the matching df3
# column even when the input's columns are not already in sorted order. A
# logical `allnames %in% names(dfX)` mask would pair columns purely by
# position and silently add the wrong columns together in that case.
df3[names(df1)] <- df3[names(df1)] + df1
df3[names(df2)] <- df3[names(df2)] + df2
df3
A B C D
1 2 1 2 1
2 4 2 4 2
Here is a fun base R method with Reduce.
# Columns present in both data frames.
shared <- intersect(names(df1), names(df2))
pieces <- list(
  Reduce("+", list(df1[shared], df2[shared])), # sum results
  df1[setdiff(names(df1), names(df2))],        # in df1, not df2
  df2[setdiff(names(df2), names(df1))]         # in df2, not df1
)
# Bind the summed shared columns and the two sets of unique columns together.
Reduce(cbind, pieces)
This returns
A C D B
1 2 2 1 1
2 4 4 2 2
This assumes that both df1 and df2 have columns that are not present in the other. If this is not true, you'd have to adjust the list.
Note also that you could replace Reduce with do.call in both places and you'd get the same result.

Resources