I have a dataframe which looks like this:
`Row Labels` Female Male
<chr> <chr> <chr>
1 London <NA> <NA>
2 42 <NA> 1
3 Paris <NA> <NA>
4 36 1 <NA>
5 Belgium <NA> <NA>
6 18 1
7 21 <NA> 1
8 Madrid <NA> <NA>
9 20 1 <NA>
10 Berlin <NA> <NA>
11 37 <NA> 1
12 23 1
13 25 1
14 44 1
The code I used to produce this dataframe looks like this:
structure(list(`Row Labels` = c("London", "42", "Paris","36", "Belgium","18" ,"21", "Madrid", "20", "Berlin", "37","23","25","44"),
Female = c(NA, NA, NA, "1", NA, NA,NA, NA, "1", NA, NA,"1","1","1"), Male = c(NA,"1", NA, NA, NA, "1", NA, NA, NA, "1",NA,NA,NA,NA)),
.Names = c("Row Labels","Female", "Male"), row.names = c(NA, -14L), class = c("tbl_df", "tbl", "data.frame"))
I would like to know how I can change multiple rows in this dataframe to become columns.
My ideal output looks like this:
'Row Labels' Female Male 42 36 21 20 37 18 23 25 44
London 1 1
Paris 1 1
Belgium 1 1 1 1
Madrid 1 1
Berlin 3 1 1 1 1 1
Seems very mechanical. Calling your data d:
# Odd rows hold the city, even rows hold the age plus the Female/Male flags
# that belong to the city directly above.
# NOTE(review): this assumes the rows strictly alternate city, age (exactly
# one age row per city) -- confirm against your data.
d1 <- d[seq(1, nrow(d), by = 2), ]
d2 <- d[seq(2, nrow(d), by = 2), ]
# Copy the flags from each age row onto its city row.
d1[, c("Male", "Female")] <- d2[, c("Male", "Female")]
# One indicator column per age: 1 on the diagonal, NA everywhere else.
d3 <- matrix(nrow = nrow(d2), ncol = nrow(d2))
diag(d3) <- 1
colnames(d3) <- d2$`Row Labels`
# Bind the indicators to the city rows (d1), not the age rows (d2), so the
# result is labelled by city.
cbind(d1, d3)
#   Row Labels Female Male 42 36 21 20 37
# 1     London   <NA>    1  1 NA NA NA NA
# 2      Paris      1 <NA> NA  1 NA NA NA
# 3    Belgium   <NA>    1 NA NA  1 NA NA
# 4     Madrid      1 <NA> NA NA NA  1 NA
# 5     Berlin   <NA>    1 NA NA NA NA  1
Using tidyverse.
library(dplyr)
library(tidyr)
# Group every city row with the age rows beneath it: a row whose label
# contains a non-digit character starts a new group.
df %>%
  group_by(gr = cumsum(grepl("\\D+", `Row Labels`))) %>%
  # Total the Female/Male flags within each city group.
  mutate(across(c(Female, Male), ~ sum(as.numeric(.x), na.rm = TRUE))) %>%
  # Label each age row as "<city>,<age>"; the city row itself gets NA ...
  mutate(RL = ifelse(row_number() > 1,
                     paste0(first(`Row Labels`), ",", `Row Labels`),
                     NA)) %>%
  # ... and is dropped here.
  filter(!is.na(RL)) %>%
  select(RL, gr, Male, Female) %>%
  # Split on the comma only, so city names containing spaces or hyphens
  # are not broken apart by the default non-alphanumeric separator.
  separate(RL, into = c("RL", "Age"), sep = ",") %>%
  mutate(flag = 1) %>%
  # One indicator column per age.
  spread(Age, flag) %>%
  ungroup() %>%
  select(-gr)
# A tibble: 5 x 12
RL Male Female `18` `20` `21` `23` `25` `36` `37` `42` `44`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Belgium 1 0 1 NA 1 NA NA NA NA NA NA
2 Berlin 1 3 NA NA NA 1 1 NA 1 NA 1
3 London 1 0 NA NA NA NA NA NA NA 1 NA
4 Madrid 0 1 NA 1 NA NA NA NA NA NA NA
5 Paris 0 1 NA NA NA NA NA 1 NA NA NA
Related
Here is some example data:
transactions <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8), day = c("day1",
"day2", "day3", "day4", "day5", "day6", "day7", "day8"), sent_to = c(NA,
"Garden Cinema", "Pasta House", NA, "Blue Superstore", NA, NA,
NA), received_from = c("ATM", NA, NA, "Sarah", NA, "Jane", "Joe",
"Emily"), reference = c("add_cash", "cinema_tickets", "meal",
"gift", "shopping", "reimbursed", "reimbursed", "reimbursed"),
decrease = c(NA, 10.8, 12.5, NA, 15.25, NA, NA, NA), increase = c(50,
NA, NA, 30, NA, 5.4, 7.25, 2), reimbursed_id = c(NA, "R",
"R", NA, NA, "2", "3", "3")), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -8L))
# id day sent_to received_from reference decrease increase reimbursed_id
# <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
# 1 1 day1 NA ATM add_cash NA 50 NA
# 2 2 day2 Garden Cinema NA cinema_tickets 10.8 NA R
# 3 3 day3 Pasta House NA meal 12.5 NA R
# 4 4 day4 NA Sarah gift NA 30 NA
# 5 5 day5 Blue Superstore NA shopping 15.2 NA NA
# 6 6 day6 NA Jane reimbursed NA 5.4 2
# 7 7 day7 NA Joe reimbursed NA 7.25 3
# 8 8 day8 NA Emily reimbursed NA 2 3
Note that this is linked to a question I have previously asked here:
How to conditionally select a column, and subtract values in those rows from rows in another conditionally selected column in R?
I would like a similar solution to the above question but this time accounting for the fact that there are multiple people who are reimbursing the user for the same day.
This is the desired outcome:
# id day sent_to received_from reference decrease increase reimbursed_id actual_decrease
# <int> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl>
# 1 1 day1 NA ATM add_cash NA 50 NA NA
# 2 2 day2 Garden Cinema NA cinema_tickets 10.8 NA R 5.4
# 3 3 day3 Pasta House NA meal 12.5 NA R 3.25
# 4 4 day4 NA Sarah gift NA 30 NA NA
# 5 5 day5 Blue Superstore NA shopping 15.2 NA NA 15.2
# 6 6 day6 NA Jane reimbursed NA 5.4 2 NA
# 7 7 day7 NA Joe reimbursed NA 7.25 3 NA
# 8 8 day8 NA Emily reimbursed NA 2 3 NA
Any help is appreciated :)
Well you should first summarise the increase per id:
# Total the reimbursements received for each original transaction.
# Only purely numeric `reimbursed_id` values point at a transaction id; the
# "R" markers and NAs are filtered out before converting, which avoids the
# "NAs introduced by coercion" warning raised by as.numeric("R").
increase_df <- transactions %>%
  filter(grepl("^[0-9]+$", reimbursed_id)) %>%
  group_by(id = as.numeric(reimbursed_id)) %>%
  summarise(increase_sum = sum(increase))
id increase_sum
<dbl> <dbl>
1 2 5.4
2 3 9.25
To then merge it and make the subtraction:
# Attach each transaction's reimbursement total, then net it off `decrease`;
# rows without any reimbursement keep their original `decrease`.
transactions %>%
  left_join(increase_df, by = "id") %>%
  mutate(
    decrease = ifelse(is.na(increase_sum), decrease, decrease - increase_sum)
  )
id day sent_to received_from reference decrease increase reimbursed_id increase_sum
<dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl>
1 1 day1 NA ATM add_cash NA 50 NA NA
2 2 day2 Garden Cinema NA cinema_tickets 5.4 NA R 5.4
3 3 day3 Pasta House NA meal 3.25 NA R 9.25
4 4 day4 NA Sarah gift NA 30 NA NA
5 5 day5 Blue Superstore NA shopping 15.2 NA NA NA
6 6 day6 NA Jane reimbursed NA 5.4 2 NA
7 7 day7 NA Joe reimbursed NA 7.25 3 NA
8 8 day8 NA Emily reimbursed NA 2 3 NA
I have example data as follows:
# Example data: a data.table with 12 values per column.
# `type` is a factor with levels "clay" and "sand".
# NOTE: `row.names = c(NA, -4L)` declares only 4 rows although every column
# holds 12 values. This mismatch is presumably what triggers the
# "`n` and `row.names` must be consistent" error reported further down.
dat <- structure(list(
zipcode = c(1001, 1002, 1003, 1004, 1101, 1102, 1103, 1104, 1201, 1202, 1203, 1302),
areacode = c(4, 4, NA, 4, 4, 4, NA, 1, 4, 4, NA, 4),
type = structure(c(1L, 1L, NA, 1L, 2L, 2L, NA, 1L, 1L, 1L, NA, 1L),
.Label = c("clay", "sand"), class = "factor"),
region = c(3, 3, NA, 3, 3, 3, NA, 3, 3, 3, NA, 3),
do_not_fill = c(1, NA, NA, 1, 1, NA, NA, 1, NA, NA, NA, 1)),
class = c("data.table", "data.frame"), row.names = c(NA, -4L))
zipcode areacode type region do_not_fill
1: 1001 4 clay 3 1
2: 1002 4 clay 3 NA
3: 1003 NA <NA> NA NA
4: 1004 4 clay 3 1
5: 1101 4 sand 3 1
6: 1102 4 sand 3 NA
7: 1103 NA <NA> NA NA
8: 1104 1 clay 3 1
9: 1201 4 clay 3 NA
10: 1202 4 clay 3 NA
11: 1203 NA <NA> NA NA
12: 1302 4 clay 3 1
I want to fill ONLY the columns areacode, type and region based on two conditions.
The areacode has to be the same before and after the NA.
The first two digits of the zipcode have to be the same before and after the NA.
Based on this solution and this solution, I attempted the following (however, data.table solutions are welcome and even preferred):
library(dplyr)
# Fill an NA in the first four columns from the previous row when both
# neighbouring rows share the same areacode and the same two-digit zipcode
# prefix.
dat |>
  mutate(type = as.character(type)) |>
  mutate(across(1:4, function(col) {
    zip_prefix <- as.numeric(substr(zipcode, 1, 2))
    same_area <- lag(areacode) == lead(areacode)
    same_prefix <- lag(zip_prefix) == lead(zip_prefix)
    ifelse(is.na(col) & same_area & same_prefix, lag(col), col)
  }))
But somewhere I am doing something wrong, because I get:
Error:
! `n` and `row.names` must be consistent.
Run `rlang::last_error()` to see where the error occurred.
Desired output:
zipcode areacode type region do_not_fill
1: 1001 4 clay 3 1
2: 1002 4 clay 3 NA
3: 1003 4 clay 3 NA
4: 1004 4 clay 3 1
5: 1101 4 sand 3 1
6: 1102 4 sand 3 NA
7: 1103 NA <NA> NA NA
8: 1104 1 clay 3 1
9: 1201 4 clay 3 NA
10: 1202 4 clay 3 NA
11: 1203 NA <NA> NA NA
12: 1302 4 clay 3 1
EDIT
# Runs the fill pipeline on a tibble copy of `dat`.
# NOTE(review): `type` is overwritten with as.character(areacode) here, which
# is why the `type` column in the output below shows 4/1 instead of
# clay/sand -- presumably as.character(type) was intended; confirm.
as_tibble(dat) |>
mutate(type = as.character(areacode)) |>
mutate(across(1:4,
~ ifelse(is.na(.) & lag(areacode) == lead(areacode) &
lag(as.numeric(substr(zipcode, 1, 2))) == lead(as.numeric(substr(zipcode, 1, 2))),
lag(.),
.)))
# A tibble: 12 x 5
zipcode areacode type region do_not_fill
<dbl> <dbl> <chr> <dbl> <dbl>
1 1001 4 4 3 1
2 1002 4 4 3 NA
3 1003 4 4 3 NA
4 1004 4 4 3 1
5 1101 4 4 3 1
6 1102 4 4 3 NA
7 1103 NA NA NA NA
8 1104 1 1 3 1
9 1201 4 4 3 NA
10 1202 4 4 3 NA
11 1203 NA NA NA NA
12 1302 4 4 3 1
You need to convert it to a tibble first. I think this is because the data.table carries inconsistent attributes: its stored row names describe only 4 rows, although every column holds 12 values.
Have a look at the rownames,
rownames(as_tibble(dat))
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
rownames(dat)
[1] "1" "2" "3" "4"
# Convert to a tibble first, then fill an NA from the row above whenever the
# rows before and after it agree on areacode and on the first two zipcode
# digits.
as_tibble(dat) |>
  mutate(type = as.character(type)) |>
  mutate(across(1:4, \(x) {
    prefix <- as.numeric(substr(zipcode, 1, 2))
    fill_here <- is.na(x) &
      lag(areacode) == lead(areacode) &
      lag(prefix) == lead(prefix)
    ifelse(fill_here, lag(x), x)
  }))
# A tibble: 12 x 5
zipcode areacode type region do_not_fill
<dbl> <dbl> <chr> <dbl> <dbl>
1 1001 4 clay 3 1
2 1002 4 clay 3 NA
3 1003 4 clay 3 NA
4 1004 4 clay 3 1
5 1101 4 sand 3 1
6 1102 4 sand 3 NA
7 1103 NA NA NA NA
8 1104 1 clay 3 1
9 1201 4 clay 3 NA
10 1202 4 clay 3 NA
11 1203 NA NA NA NA
12 1302 4 clay 3 1
This can be done in data.table using the same code:
# Fill every column except do_not_fill where the value itself is NA and both
# neighbouring rows agree on areacode and on the first two zipcode digits.
# The NA test is on the column being filled (`v`), not on `areacode`, so a
# column that is not missing (e.g. zipcode) is never overwritten by the row
# above. shift() is data.table's own lag/lead, so dplyr need not be attached.
dat[, c(lapply(.SD, \(v) {
  zip_prefix <- as.numeric(substr(zipcode, 1, 2))
  fifelse(
    is.na(v) &
      shift(areacode) == shift(areacode, type = "lead") &
      shift(zip_prefix) == shift(zip_prefix, type = "lead"),
    shift(v), v)
}),
# do_not_fill is excluded from .SD and passed through unchanged.
list(do_not_fill = do_not_fill)), .SDcols = !patterns("do_not_fill")]
zipcode areacode type region do_not_fill
<num> <num> <fctr> <num> <num>
1: 1001 4 clay 3 1
2: 1002 4 clay 3 NA
3: 1003 4 clay 3 NA
4: 1004 4 clay 3 1
5: 1101 4 sand 3 1
6: 1102 4 sand 3 NA
7: 1103 NA <NA> NA NA
8: 1104 1 clay 3 1
9: 1201 4 clay 3 NA
10: 1202 4 clay 3 NA
11: 1203 NA <NA> NA NA
12: 1302 4 clay 3 1
I have a large data.table in the format below
Name Value 1 2 3 4 5
A 58 1 NA NA NA NA
B 47 NA 1 NA NA NA
C 89 NA NA 1 NA NA
D 68 NA NA NA 1 NA
E 75 NA NA NA NA 1
I would like to forward-fill the rows of the data table to achieve the results below. I already know how to forward-fill columns.
Name Value 1 2 3 4 5
A 58 1 1 1 1 1
B 47 NA 1 1 1 1
C 89 NA NA 1 1 1
D 68 NA NA NA 1 1
E 75 NA NA NA NA 1
Help!
data.table has its own nafill function.
library(data.table) #v>=1.12.8
library(magrittr)
# Reshape to long form (one row per Name/Value/variable), carry the last
# observed value forward within each Name, then reshape back to wide form.
# Arguments are spelled out in full (id.vars, type, value.var) rather than
# relying on partial matching or guessing.
long <- melt(dt, id.vars = 1:2)
long[, value := nafill(value, type = "locf"), by = Name]
dcast(long, ... ~ variable, value.var = "value")
# Name Value 1 2 3 4 5
# 1: A 58 1 1 1 1 1
# 2: B 47 NA 1 1 1 1
# 3: C 89 NA NA 1 1 1
# 4: D 68 NA NA NA 1 1
# 5: E 75 NA NA NA NA 1
Data
dt <- fread("Name Value 1 2 3 4 5
A 58 1 NA NA NA NA
B 47 NA 1 NA NA NA
C 89 NA NA 1 NA NA
D 68 NA NA NA 1 NA
E 75 NA NA NA NA 1")
Use fill in tidyr to fill in missing values with previous value.
library(dplyr)
library(tidyr)
# Stack columns 3 to 7 into name/value pairs, carry the last seen value
# forward within each Name, then spread the pairs back into columns.
df %>%
  pivot_longer(cols = 3:7) %>%
  group_by(Name) %>%
  fill(value, .direction = "down") %>%
  ungroup() %>%
  pivot_wider(names_from = name, values_from = value)
# # A tibble: 5 x 7
# Name Value `1` `2` `3` `4` `5`
# <fct> <int> <int> <int> <int> <int> <int>
# 1 A 58 1 1 1 1 1
# 2 B 47 NA 1 1 1 1
# 3 C 89 NA NA 1 1 1
# 4 D 68 NA NA NA 1 1
# 5 E 75 NA NA NA NA 1
Note: The output above is the same as
df %>% fill(3:7, .direction = "up")
but the logic is different. The former belongs to "filling rows forward" and the latter is "filling columns backward". They will differ in other cases.
Data
df <- structure(list(Name = structure(1:5, .Label = c("A", "B", "C",
"D", "E"), class = "factor"), Value = c(58L, 47L, 89L, 68L, 75L
), `1` = c(1L, NA, NA, NA, NA), `2` = c(NA, 1L, NA, NA, NA),
`3` = c(NA, NA, 1L, NA, NA), `4` = c(NA, NA, NA, 1L, NA),
`5` = c(NA, NA, NA, NA, 1L)), class = "data.frame", row.names = c(NA, -5L))
I have a survey where people rank some schools. The survey results in multiple columns with 1s and 2s. I need to collapse these into a person's first and second choice.
Here is an example of the data that I have.
df1 <- tibble(Person = c(1 , 2 , 3 , 4 , 5 , 6 , 7),
School1 = c(NA, 1 , 2 , NA, NA, NA, 1 ),
School2 = c(NA, 2 , 1 , NA, NA, 1 , NA),
School3 = c(1 , NA, NA, NA, NA, 2 , NA),
School4 = c(2 , NA, NA, 1 , 2 , NA, NA),
School5 = c(NA, NA, NA, 2 , 1 , NA, 2))
Person School1 School2 School3 School4 School5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA NA 1 2 NA
2 2 1 2 NA NA NA
3 3 2 1 NA NA NA
4 4 NA NA NA 1 2
5 5 NA NA NA 2 1
6 6 NA 1 2 NA NA
7 7 1 NA NA NA 2
Here is the result that I need.
df2 <- tibble(Person = c(1 , 2 , 3 , 4 , 5 , 6 , 7),
School1 = c(NA, 1 , 2 , NA, NA, NA, 1 ),
School2 = c(NA, 2 , 1 , NA, NA, 1 , NA),
School3 = c(1 , NA, NA, NA, NA, 2 , NA),
School4 = c(2 , NA, NA, 1 , 2 , NA, NA),
School5 = c(NA, NA, NA, 2 , 1 , NA, 2),
Firstchoice = c('School3', 'School1', 'School2', 'School4', 'School5', 'School2', 'School1'),
Secondchoice = c('School4', 'School2', 'School1', 'School5', 'School4', 'School3', 'School5'))
Person School1 School2 School3 School4 School5 Firstchoice Secondchoice
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 1 NA NA 1 2 NA School3 School4
2 2 1 2 NA NA NA School1 School2
3 3 2 1 NA NA NA School2 School1
4 4 NA NA NA 1 2 School4 School5
5 5 NA NA NA 2 1 School5 School4
6 6 NA 1 2 NA NA School2 School3
7 7 1 NA NA NA 2 School1 School5
I have looked at mutate and at using a for loop; however, I can't figure out how to get them to work, since they would need to do inline updates of a column.
Any help would be appreciated.
One tidyverse possibility could be:
# Reshape to one row per (Person, school), translate rank 1/2 into the name
# of the target column, and widen again so each person gets one Firstchoice
# and one Secondchoice column holding the school name.
df1 %>%
  pivot_longer(-Person, names_to = "var", values_to = "val") %>%
  mutate(val = case_when(
    val == 1 ~ "Firstchoice",
    val == 2 ~ "Secondchoice"  # any other rank (or NA) becomes NA
  )) %>%
  # Unranked schools carry no information; drop them before widening.
  drop_na(val) %>%
  # names_sort keeps Firstchoice ahead of Secondchoice regardless of which
  # rank happens to appear first in the data.
  pivot_wider(names_from = val, values_from = var, names_sort = TRUE) %>%
  left_join(df1, by = "Person")
Person Firstchoice Secondchoice School1 School2 School3 School4 School5
<dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 School3 School4 NA NA 1 2 NA
2 2 School1 School2 1 2 NA NA NA
3 3 School2 School1 2 1 NA NA NA
4 4 School4 School5 NA NA NA 1 2
5 5 School5 School4 NA NA NA 2 1
6 6 School2 School3 NA 1 2 NA NA
7 7 School1 School5 1 NA NA NA 2
Piece of cake with tidyr:
# Long table of the ranked schools only (unranked NA cells are dropped),
# ordered by person and then by rank.
choices <- df1 %>%
  gather(key = "school", value = "choice", -Person, na.rm = TRUE) %>%
  arrange(Person, choice)
I have some data that looks like this:
samp
# A tibble: 5 x 2
ID Source
<dbl> <chr>
1 34221 75
2 33861 75
3 59741 126,123
4 56561 111,105
5 55836 36,34,34,36,22
For each of the distinct values, I want to create a new column. If the value exists in a row, I want to insert an "x"; otherwise the cell should stay empty.
Example (pseudo code) of the expected result:
ID 75 126 123 111 105 36 34 22
1 34221 x
2 33861 x
3 59741 x x
4 56561 x x
5 55836 x x x
I tried it with the separate function of the tidyr package, like this for a start:
# Every distinct value that occurs anywhere in Source becomes a column name.
into <- samp$Source %>%
  strsplit(",") %>%
  unlist() %>%
  unique()
# separate() assigns the pieces by position, not by matching column name.
separate(samp, col = "Source", into = into, sep = ",")
However, this doesn't work, because if there is more than one value in a row, the values are not assigned to their respective columns (e.g. for the ID 59741 the value 126 is in column 75 and not in column 126).
A tibble: 5 x 9
ID `75` `126` `123` `111` `105` `36` `34` `22`
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 34221 75 NA NA NA NA NA NA NA
2 33861 75 NA NA NA NA NA NA NA
3 59741 126 123 NA NA NA NA NA NA
4 56561 111 105 NA NA NA NA NA NA
5 55836 36 34 34 36 22 NA NA NA
Here is a dput:
structure(list(ID = c(34221, 33861, 59741, 56561, 55836), Source = c("75",
"75", "126,123", "111,105", "36,34,34,36,22")), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
Could also do:
library(tidyverse)
# Split Source into a list-column of individual values, expand it to one row
# per (ID, value), drop repeated values within an ID, and spread the values
# into columns marked with "x".
df %>%
  mutate(Source = strsplit(Source, ","),
         dummy = "x") %>%
  # Name the list-column explicitly; calling unnest() without `cols` is
  # deprecated and emits a warning.
  unnest(cols = Source) %>%
  distinct() %>%
  spread(Source, dummy)
Output:
ID `105` `111` `123` `126` `22` `34` `36` `75`
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 33861 NA NA NA NA NA NA NA x
2 34221 NA NA NA NA NA NA NA x
3 55836 NA NA NA NA x x x NA
4 56561 x x NA NA NA NA NA NA
5 59741 NA NA x x NA NA NA NA
The package splitstackshape is very handy for such operations, i.e.
library(splitstackshape)
cSplit_e(df, "Source", mode = "binary", type = "character", fill = 0, drop = TRUE)
which gives,
ID Source_105 Source_111 Source_123 Source_126 Source_22 Source_34 Source_36 Source_75
1 34221 0 0 0 0 0 0 0 1
2 33861 0 0 0 0 0 0 0 1
3 59741 0 0 1 1 0 0 0 0
4 56561 1 1 0 0 0 0 0 0
5 55836 0 0 0 0 1 1 1 0
Another option is using tidyr::separate_rows
library(dplyr)
library(tidyr)
# One row per (ID, value), de-duplicated, then one "X"-marked column per
# distinct value.
df %>%
  separate_rows(Source, sep = ",") %>%
  distinct() %>%
  mutate(dummy = "X") %>%
  spread(key = Source, value = dummy)
ID 105 111 123 126 22 34 36 75
1 33861 <NA> <NA> <NA> <NA> <NA> <NA> <NA> X
2 34221 <NA> <NA> <NA> <NA> <NA> <NA> <NA> X
3 55836 <NA> <NA> <NA> <NA> X X X <NA>
4 56561 X X <NA> <NA> <NA> <NA> <NA> <NA>
5 59741 <NA> <NA> X X <NA> <NA> <NA> <NA>