I have a data frame of 300K rows with 2 main columns of interest (NAME & SUBJECT). I need to convert this data into a wide format and, in addition, if I get records for a particular subject with multiple dates, I need to place them next to each other.
I tried using tidyr::pivot_wider but I'm not able to get it to work.
Sample data:
DF <- data.frame(
NAME = c("ABC", "ABC", "DEF", "ABC", "ABC", "ABC", "DEF", "ABC", "DEF", "ABC", "DEF", "DEF", "DEF", "DEF", "DEF", "DEF", "ABC"),
SUBJECT = c("MATHS", "LANGUAGE 1", "LANGUAGE 1", "LANGUAGE 2","LANGUAGE 2","LANGUAGE 2","LANGUAGE 2", "SCIENCE", "SCIENCE", "HISTORY", "PE", "ENVIRONMENT", "COMPUTERS", "COMPUTERS", "COMPUTERS", "BIOLOGY", "SANSKRIT"),
YEAR = c("2010", "2011", "2012", "2013", "2014", "2015", "2013", "2015", "2016", "2016", "2017", "2015", "2016", "2017", "2018", "2015", "2013"),
MARKS = c("45", "48", "47", "44", "48", "46", "42", "42", "43", "37", "42", "43", "42", "41", "44", "41", "44"),
MAXIMUM = c("46", rep("50", 5), "45", "50", rep("45", 9))
)
> DF
NAME SUBJECT YEAR MARKS MAXIMUM
1 ABC MATHS 2010 45 46
2 ABC LANGUAGE 1 2011 48 50
3 DEF LANGUAGE 1 2012 47 50
4 ABC LANGUAGE 2 2013 44 50
5 ABC LANGUAGE 2 2014 48 50
6 ABC LANGUAGE 2 2015 46 50
7 DEF LANGUAGE 2 2013 42 45
8 ABC SCIENCE 2015 42 50
9 DEF SCIENCE 2016 43 45
10 ABC HISTORY 2016 37 45
11 DEF PE 2017 42 45
12 DEF ENVIRONMENT 2015 43 45
13 DEF COMPUTERS 2016 42 45
14 DEF COMPUTERS 2017 41 45
15 DEF COMPUTERS 2018 44 45
16 DEF BIOLOGY 2015 41 45
17 ABC SANSKRIT 2013 44 45
My expected output is like this: (It is a bit long)
Bit tricky with pivoting twice, but here you go:
library(tidyverse)
DF %>%
  group_by(NAME, SUBJECT) %>%
  mutate(ind = row_number()) %>%                 # index repeated NAME/SUBJECT records
  ungroup() %>%
  pivot_longer(c("YEAR", "MARKS", "MAXIMUM")) %>%
  mutate(name = paste0(name, ind)) %>%           # e.g. YEAR1, MARKS1, MAXIMUM1
  select(-ind) %>%
  pivot_wider(names_from = c("SUBJECT", "name"), values_from = "value")
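For what it's worth, the same wide shape can also be reached with a single pivot_wider() call, since pivot_wider() accepts several values_from columns; the difference is only in the column names and their order (YEAR_MATHS_1, MARKS_MATHS_1, ... instead of MATHS_YEAR1, MATHS_MARKS1, ...). A sketch, assuming a reasonably recent tidyr:

library(tidyverse)
DF %>%
  group_by(NAME, SUBJECT) %>%
  mutate(ind = row_number()) %>%   # index repeated NAME/SUBJECT records, as above
  ungroup() %>%
  pivot_wider(
    names_from  = c(SUBJECT, ind),
    values_from = c(YEAR, MARKS, MAXIMUM)
  )

With the default naming all the YEAR_* columns come first, then MARKS_*, then MAXIMUM_*; if your tidyr version has the names_vary argument, names_vary = "slowest" keeps each subject's YEAR/MARKS/MAXIMUM columns next to each other, which is what the double pivot above achieves.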
Is there a quick way to replace variable names with the content of the first row of a tibble?
So turning something like this:
Subject Q1 Q2 Q3
Subject age gender cue
429753 24 1 man
b952x8 23 2 mushroom
264062 19 1 night
53082m 35 1 moon
Into this:
Subject age gender cue
429753 24 1 man
b952x8 23 2 mushroom
264062 19 1 night
53082m 35 1 moon
My dataset has over 100 variables so I'm looking for a way that doesn't involve typing out each old and new variable name.
A possible solution:
df <- structure(list(Subject = c("Subject", "429753", "b952x8", "264062",
"53082m"), Q1 = c("age", "24", "23", "19", "35"), Q2 = c("gender",
"1", "2", "1", "1"), Q3 = c("cue", "man", "mushroom", "night",
"moon")), row.names = c(NA, -5L), class = "data.frame")
names(df) <- df[1,]
df <- df[-1,]
df
#> Subject age gender cue
#> 2 429753 24 1 man
#> 3 b952x8 23 2 mushroom
#> 4 264062 19 1 night
#> 5 53082m 35 1 moon
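If janitor is an option (an assumption; the question doesn't mention it), its row_to_names() helper wraps the same two steps in one call:

library(janitor)
df <- row_to_names(df, row_number = 1)  # promote row 1 to column names and drop it
df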
I have a table which looks like this:
df1 <- data.frame(
"seqid" = c("12", "12", "13", "12", "12", "15"),
"source" = c("star", "star", "star", "star", "star", "star"),
"type" = c("CDS", "CDS", "CDS", "intron", "CDS", "intron"),
"start" = c("15", "21", "23", "35", "45", "60"),
"end" = c("70", "80", "86", "45", "67", "88"),
"attributes" = c("ENSOCUT00000011013", "ENSOCUT00000064484",
"ENSOCUT00000013302",
"ENSOCUT00000010968", "ENSOCUT00000010968", "ENSOCUT00000060283"),
stringsAsFactors = F,check.names=FALSE)
seqid  source  type    start  end  attributes
12     star    CDS     15     70   ENSOCUT00000011013
12     star    CDS     21     80   ENSOCUT00000064484
12     star    CDS     23     86   ENSOCUT00000013302
12     star    intron  35     45   ENSOCUT00000010968
12     star    CDS     45     67   ENSOCUT00000010968
12     star    intron  60     88   ENSOCUT00000060283
I want my final result to look like this:
seqid  source  type    start  end  attributes
12     star    CDS     15     70   ENSOCUT00000011013
12     star    CDS     21     80   ENSOCUT00000064484
12     star    CDS     23     86   ENSOCUT00000013302
12     star    CDS     45     67   ENSOCUT00000010968
So I want to keep only the rows whose attributes value is one of ENSOCUT00000011013, ENSOCUT00000064484, ENSOCUT00000013302 or ENSOCUT00000010968, and of those only the rows whose type column is CDS.
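A direct way to express that (a sketch using dplyr; filter() alone is enough here since nothing is being summarised, and keep_ids is just a helper name for illustration):

library(dplyr)
keep_ids <- c("ENSOCUT00000011013", "ENSOCUT00000064484",
              "ENSOCUT00000013302", "ENSOCUT00000010968")
df1 %>%
  filter(attributes %in% keep_ids, type == "CDS")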
I have a table which looks like this:
df1 <- data.frame(
"seqid" = c("12", "12", "13", "12", "12", "15"),
"source" = c("star", "star", "star", "star", "star", "star"),
"type" = c("CDS", "CDS", "CDS", "intron", "CDS", "intron"),
"start" = c("15", "21", "23", "35", "45", "60"),
"end" = c("70", "80", "86", "45", "67", "88"),
"attributes" = c("ENSOCUT00000011013", "ENSOCUT00000064484",
"ENSOCUT00000013302",
"ENSOCUT00000010968", "ENSOCUT00000010968", "ENSOCUT00000060283"),
stringsAsFactors = F,check.names=FALSE)
seqid  source  type    start  end  attributes
12     star    CDS     15     70   ENSOCUT00000011013
12     star    CDS     21     80   ENSOCUT00000064484
12     star    CDS     23     86   ENSOCUT00000013302
12     star    intron  35     45   ENSOCUT00000010968
12     star    CDS     45     67   ENSOCUT00000010968
12     star    intron  60     88   ENSOCUT00000060283
And I want to extract only rows 1, 2, 3, 5 to have a final result looking like this:
seqid  source  type    start  end  attributes
12     star    CDS     15     70   ENSOCUT00000011013
12     star    CDS     21     80   ENSOCUT00000064484
12     star    CDS     23     86   ENSOCUT00000013302
12     star    CDS     45     67   ENSOCUT00000010968
df1[c(1,2,3,5),]
In general, to select rows/columns by number inside the brackets of a data.frame df:
df[rows_selected_go_here, columns_selected_go_here]
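For example (hypothetical selections on the df1 above, just to show the pattern):

df1[c(1, 2, 3, 5), ]                          # rows 1, 2, 3 and 5, all columns
df1[-c(4, 6), ]                               # same result: drop rows 4 and 6
df1[c(1, 2, 3, 5), c("type", "attributes")]   # the same rows, two chosen columns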
I assume that you only want those entries from df1 where the type column (a string) equals "CDS":
library(tidyverse)
df1 <- mutate(df1, TOBINCL = 0)
df1$TOBINCL[grepl("^CDS$", df1$type, ignore.case = TRUE)] <- 1
mynewdf <- df1[df1$TOBINCL == 1, ]
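The same selection in one step, if the helper column isn't needed (a sketch):

mynewdf <- dplyr::filter(df1, grepl("^CDS$", type, ignore.case = TRUE))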
I have a demographic data set, which includes the ages of people in a household. This is collected via a survey, and participants are allowed to refuse to provide their age.
The result is a data set with one household per row (each with a household ID code) and various household characteristics, such as age, in the columns. Refused responses are coded as "R", and you can re-create a sample using the code below:
df <- list(Household_ID = c("1A", "1B", "1C", "1D", "1E"),
AGE1 = c("25", "47", "39", "50", "R"),
AGE2 = c("66", "23", "71", "R", "16"),
AGE3 = c("28", "17", "R", "R", "80"),
AGE4 = c("81", "22", "48", "59", "R"))
df <- as_tibble(df)
> df
# A tibble: 5 x 5
Household_ID AGE1 AGE2 AGE3 AGE4
<chr> <chr> <chr> <chr> <chr>
1 1A 25 66 28 81
2 1B 47 23 17 22
3 1C 39 71 R 48
4 1D 50 R R 59
5 1E R 16 80 R
For our purposes we re-code the "R" values to "-9" so that we can subsequently convert the AGE columns to integer and carry out analysis. We usually do this in other software, and my objective is to replicate that process in R.
I have managed to do this with the following code:
df <- df %>% mutate(AGE1 = case_when(AGE1 == "R" ~ "-9", TRUE ~ as.character(AGE1)))
df <- df %>% mutate(AGE2 = case_when(AGE2 == "R" ~ "-9", TRUE ~ as.character(AGE2)))
df <- df %>% mutate(AGE3 = case_when(AGE3 == "R" ~ "-9", TRUE ~ as.character(AGE3)))
df <- df %>% mutate(AGE4 = case_when(AGE4 == "R" ~ "-9", TRUE ~ as.character(AGE4)))
Given that this feels clumsy, I tried to find a solution using mutate_if etc., but read that these have been superseded by across(). Hence, I tried to replicate this operation using across():
df <- df %>%
  mutate(across(AGE1:AGE4),
         ~ (case_when(. == "R" ~ "-9")))
But I get the following error:
Error: Problem with `mutate()` input `..2`.
x Input `..2` must be a vector, not a `formula` object.
i Input `..2` is `~(case_when(. == "R" ~ "-9"))`.
Been wrestling with this and googling for a while now but can't figure out what I am missing. Would really appreciate some input on how to get this working, please and thank you.
EDIT: Solved!
df <- df %>%
mutate(across(AGE1:AGE4, ~ (case_when(.x == "R" ~ "-9", TRUE ~ as.character(.x)))))
Or maybe this one, which is not much different from dear @TarJae's interpretation:
library(dplyr)
library(stringr)
df %>%
  mutate(across(AGE1:AGE4, ~ str_replace(., "R", "-9")),
         across(AGE1:AGE4, as.integer))
# A tibble: 5 x 5
Household_ID AGE1 AGE2 AGE3 AGE4
<chr> <int> <int> <int> <int>
1 1A 25 66 28 81
2 1B 47 23 17 22
3 1C 39 71 -9 48
4 1D 50 -9 -9 59
5 1E -9 16 80 -9
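One small caveat (my note, not part of the answer above): str_replace(., "R", "-9") does a substring replacement, which is fine here because each AGE value is either a plain number or exactly "R"; an anchored pattern is slightly safer if other values could ever contain an "R":

df %>%
  mutate(across(AGE1:AGE4, ~ str_replace(., "^R$", "-9")),
         across(AGE1:AGE4, as.integer))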
Data:
df <- list(Household_ID = c("1A", "1B", "1C", "1D", "1E"),
AGE1 = c("25", "47", "39", "50", "R"),
AGE2 = c("66", "23", "71", "R", "16"),
AGE3 = c("28", "17", "R", "R", "80"),
AGE4 = c("81", "22", "48", "59", "R"))
df <- as_tibble(df)
Why not simply?
df[,2:5][df[, 2:5] == 'R'] <- '-9'
# A tibble: 5 x 5
Household_ID AGE1 AGE2 AGE3 AGE4
<chr> <chr> <chr> <chr> <chr>
1 1A 25 66 28 81
2 1B 47 23 17 22
3 1C 39 71 -9 48
4 1D 50 -9 -9 59
5 1E -9 16 80 -9
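If you then also want the AGE columns as integers (a small follow-up sketch, not part of the answer above):

df[2:5] <- lapply(df[2:5], as.integer)
df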
You could use across() with replace():
- convert the list to a tibble with as_tibble()
- replace "R" with "-9"
- convert to the appropriate classes (integer for the AGE columns) with type.convert()
df %>%
  as_tibble() %>%
  mutate(across(everything(), ~ replace(., . == "R", "-9"))) %>%
  type.convert(as.is = TRUE)
Output:
Household_ID AGE1 AGE2 AGE3 AGE4
<chr> <int> <int> <int> <int>
1 1A 25 66 28 81
2 1B 47 23 17 22
3 1C 39 71 -9 48
4 1D 50 -9 -9 59
5 1E -9 16 80 -9
How could I add rows with the sum of VL_FOB_real for each CO_ANO-niv100-subsector group in an easier way? I couldn't figure out how to use add_row and the like to do so, only by creating a new data frame and then appending it.
Here is what I have done:
df <- structure(list(CO_ANO = c("1996", "1990", "1993", "1993", "1994",
"1992", "1995", "1995", "1996", "1995",
"1994", "1990", "1989", "1992", "1995"),
CO_UF = c("32", "45", "45", "36", "55", "99", "36",
"34", "14", "25", "53", "41", "41", "41", "16"),
niv100 = c("2210","1530", "210", "3210", "1530", "2610", "2210",
"2630", "1030","1020", "3020", "3020", "410", "2510",
"1520"),
subsector = c("11","8", "1", "7", "8", "13", "11", "13", "4", "5",
"13", "13", "2","13", "8"),
VL_FOB_real = c(1, 2, 3,
1, 4, 5,
5, 6, 7,
6, 8, 9,
10, 11, 11)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,-15L))
df1 <- df %>%
  group_by(CO_ANO, subsector, niv100) %>%
  summarise(VL_FOB_real = sum(VL_FOB_real)) %>%
  mutate(CO_UF = 'Total')
df <- bind_rows(df1, df)
This groups the rows and then modifies each group using adorn_totals.
library(dplyr)
library(janitor)
df %>%
  group_by(CO_ANO, CO_UF, niv100) %>%
  group_modify(~ adorn_totals(.x, where = "row"))
giving:
# A tibble: 30 x 5
# Groups: CO_ANO, CO_UF, niv100 [15]
CO_ANO CO_UF niv100 subsector VL_FOB_real
<chr> <chr> <chr> <chr> <dbl>
1 1989 41 410 2 10
2 1989 41 410 Total 10
3 1990 41 3020 13 9
4 1990 41 3020 Total 9
5 1990 45 1530 8 2
6 1990 45 1530 Total 2
7 1992 41 2510 13 11
8 1992 41 2510 Total 11
9 1992 99 2610 13 5
10 1992 99 2610 Total 5
# ... with 20 more rows
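If you don't want the result to stay grouped (the printout above still carries the 15 groups), you can finish the same pipeline with ungroup() (a small follow-up sketch):

df %>%
  group_by(CO_ANO, CO_UF, niv100) %>%
  group_modify(~ adorn_totals(.x, where = "row")) %>%
  ungroup()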
Another thing to try is the following, which gives somewhat different output. It splits the input into groups and applies adorn_totals separately to each group, giving a c("tabyl", "tbl_df", "tbl", "data.frame") object.
library(dplyr)
library(janitor)
library(purrr)
df %>%
  group_split(CO_ANO, subsector, niv100, CO_UF) %>%
  map_df(adorn_totals)
Honestly, I would do what you have done to add rows for each group, but for the purpose of demonstrating a way to use add_row, here's an answer:
library(dplyr)
library(purrr)
df %>%
  group_split(CO_ANO, subsector, niv100) %>%
  map_df(~ add_row(.x, CO_ANO = first(.x$CO_ANO), subsector = first(.x$subsector),
                   niv100 = first(.x$niv100), VL_FOB_real = sum(.x$VL_FOB_real),
                   CO_UF = 'Total'))
# CO_ANO CO_UF niv100 subsector VL_FOB_real
# <chr> <chr> <chr> <chr> <dbl>
# 1 1989 41 410 2 10
# 2 1989 Total 410 2 10
# 3 1990 41 3020 13 9
# 4 1990 Total 3020 13 9
# 5 1990 45 1530 8 2
# 6 1990 Total 1530 8 2
# 7 1992 41 2510 13 11
# 8 1992 Total 2510 13 11
# 9 1992 99 2610 13 5
#10 1992 Total 2610 13 5
# … with 20 more rows
The only benefit I see of this approach is that you get a "Total" row for each group immediately after the group, unlike with bind_rows, where you get all the "Total" rows together.
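If you prefer the bind_rows route but still want each "Total" row to sit next to its group, one option is to arrange after binding (a sketch, using df and df1 as defined in the question before its final bind_rows; it relies on "Total" sorting after the numeric CO_UF codes, which holds for these character values):

library(dplyr)
bind_rows(df, ungroup(df1)) %>%
  arrange(CO_ANO, subsector, niv100, CO_UF)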