Using dplyr to fill in missing values (through a join?) - r

I have a data frame (df1) that has some missing values (city, state):
SiteID City StateBasedIn Lat Lon Var1 Var2
4227 Richmond KY -39 -113 6 0
4987 Nashville TN -33 -97 7 0
4000 Newark NJ -39 -95 8 0
4925 Miami FL -40 -99 0 0
4437 Montgomery AL -32 -117 4 1
4053 Jonesboro AR -30 -98 8 1
df1 <- structure(list(SiteID = c(4227L, 4987L, 4000L, 4925L, 4437L,
4053L, 4482L, 4037L, 4020L, 1787L, 2805L, 3025L, 3027L, 3028L,
3029L, 3030L, 3031L, 3033L), City = structure(c(10L, 7L, 8L,
5L, 6L, 4L, 2L, 9L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"Arcata", "Jackson", "Jonesboro", "Miami", "Montgomery", "Nashville",
"Newark", "Portland", "Richmond"), class = "factor"), StateBasedIn = structure(c(6L,
10L, 8L, 5L, 2L, 3L, 4L, 9L, 7L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("", "AL", "AR", "CA", "FL", "KY", "MS", "NJ",
"OR", "TN"), class = "factor"), Lat = c(-39L, -33L, -39L, -40L,
-32L, -30L, -38L, -31L, -35L, -38L, -30L, -39L, -38L, -32L, -39L,
-31L, -38L, -34L), Lon = c(-113L, -97L, -95L, -99L, -117L, -98L,
-98L, -95L, -112L, -120L, -114L, -81L, -117L, -90L, -109L, -115L,
-81L, -104L), Var1 = c(6L, 7L, 8L, 0L, 4L, 8L, 1L, 8L, 0L, 3L,
3L, 7L, 4L, 8L, 0L, 8L, 1L, 3L), Var2 = c(0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), .Names = c("SiteID",
"City", "StateBasedIn", "Lat", "Lon", "Var1", "Var2"), class = "data.frame", row.names = c(NA,
-18L))
And I would like to fill those values in by merging with another data frame (df2) that has 3 of the same columns, but not all of the columns that are in df1:
SiteID City StateBasedIn
1787 Lusby MD
2805 Springdale AR
3025 Saukville WI
3027 Saukville WI
3028 Saukville WI
3029 Saukville WI
df2 <- structure(list(SiteID = c(1787L, 2805L, 3025L, 3027L, 3028L,
3029L, 3030L, 3031L, 3033L), City = structure(c("Lusby", "Springdale",
"Saukville", "Saukville", "Saukville", "Saukville", "Saukville",
"Mequon", "Mequon"), .Dim = c(9L, 1L)), StateBasedIn = structure(c("MD",
"AR", "WI", "WI", "WI", "WI", "WI", "WI", "WI"), .Dim = c(9L,
1L))), row.names = c(NA, -9L), class = "data.frame", .Names = c("SiteID",
"City", "StateBasedIn"))
So basically I would like to retain all of the information in df1 and fill in the missing values that are available from df2. As I'm not too familiar with all of the dplyr options yet, I tried the different 'join' options but had no luck. I also tried 'merge' from base R, but still no success. Is there another way to do this (preferably with dplyr)?

You can use a full_join from dplyr, along with replace and coalesce to put together a pretty concise solution.
library(dplyr)
library(purrr)
# Cleaning from r2evans (if you want to keep it to dplyr, just use r2evans' lapply method)
df1 <- mutate_if(df1, is.factor, as.character)
df2 <- dmap(df2, as.vector)
full_join(df1, df2, by = "SiteID") %>%
  mutate_at(vars(matches("City|StateBased")), funs(replace(., . == "", NA))) %>%
  mutate(City = coalesce(City.y, City.x),
         StateBasedIn = coalesce(StateBasedIn.y, StateBasedIn.x)) %>%
  select(-contains("."))

This solution is not very stylish, but at least it is a solution.
library(dplyr)
library(magrittr)
aux <- df1 %>%
  # keep only the rows with missing values
  filter(City == "") %>%
  # drop City and StateBasedIn so that the columns
  # are not duplicated after the join
  select(-c(City, StateBasedIn)) %>%
  # inner join with the second data frame
  inner_join(df2, by = "SiteID") %>%
  # restore the original column order
  select(SiteID, City, StateBasedIn, Lat, Lon, Var1, Var2)
df1 %<>%
  # keep all rows whose values are not missing
  filter(City != "") %>%
  # bind the auxiliary data frame
  rbind(aux)
Results in:
SiteID City StateBasedIn Lat Lon Var1 Var2
1 4227 Richmond KY -39 -113 6 0
2 4987 Nashville TN -33 -97 7 0
3 4000 Newark NJ -39 -95 8 0
4 4925 Miami FL -40 -99 0 0
5 4437 Montgomery AL -32 -117 4 1
6 4053 Jonesboro AR -30 -98 8 1
7 4482 Arcata CA -38 -98 1 1
8 4037 Portland OR -31 -95 8 1
9 4020 Jackson MS -35 -112 0 1
10 1787 Lusby MD -38 -120 3 0
11 2805 Springdale AR -30 -114 3 1
12 3025 Saukville WI -39 -81 7 1
13 3027 Saukville WI -38 -117 4 0
14 3028 Saukville WI -32 -90 8 0
15 3029 Saukville WI -39 -109 0 1
16 3030 Saukville WI -31 -115 8 0
17 3031 Mequon WI -38 -81 1 1
18 3033 Mequon WI -34 -104 3 0

Slightly simplified version of Felix's answer.
First, repairing the data by changing factor to character, and removing the apparent matrices from the second one:
str(df1)
# 'data.frame': 18 obs. of 7 variables:
# $ SiteID : int 4227 4987 4000 4925 4437 4053 4482 4037 4020 1787 ...
# $ City : Factor w/ 10 levels "","Arcata","Jackson",..: 10 7 8 5 6 4 2 9 3 1 ...
# $ StateBasedIn: Factor w/ 10 levels "","AL","AR","CA",..: 6 10 8 5 2 3 4 9 7 1 ...
# $ Lat : int -39 -33 -39 -40 -32 -30 -38 -31 -35 -38 ...
# $ Lon : int -113 -97 -95 -99 -117 -98 -98 -95 -112 -120 ...
# $ Var1 : int 6 7 8 0 4 8 1 8 0 3 ...
# $ Var2 : int 0 0 0 0 1 1 1 1 1 0 ...
str(df2)
# 'data.frame': 9 obs. of 3 variables:
# $ SiteID : int 1787 2805 3025 3027 3028 3029 3030 3031 3033
# $ City : chr [1:9, 1] "Lusby" "Springdale" "Saukville" "Saukville" ...
# $ StateBasedIn: chr [1:9, 1] "MD" "AR" "WI" "WI" ...
df1 <- mutate_if(df1, is.factor, as.character)
df2[] <- lapply(df2, as.vector)
Now the work:
library(dplyr)
df1 %>%
  left_join(select(df2, SiteID, cty = City, st = StateBasedIn), by = "SiteID") %>%
  mutate(
    City = ifelse(nzchar(City), City, cty),
    StateBasedIn = ifelse(grepl("[^\\s]", StateBasedIn), StateBasedIn, st)
  ) %>%
  select(-cty, -st)
# SiteID City StateBasedIn Lat Lon Var1 Var2
# 1 4227 Richmond KY -39 -113 6 0
# 2 4987 Nashville TN -33 -97 7 0
# 3 4000 Newark NJ -39 -95 8 0
# 4 4925 Miami FL -40 -99 0 0
# 5 4437 Montgomery AL -32 -117 4 1
# 6 4053 Jonesboro AR -30 -98 8 1
# 7 4482 Arcata CA -38 -98 1 1
# 8 4037 Portland OR -31 -95 8 1
# 9 4020 Jackson MS -35 -112 0 1
# 10 1787 Lusby MD -38 -120 3 0
# 11 2805 Springdale AR -30 -114 3 1
# 12 3025 Saukville WI -39 -81 7 1
# 13 3027 Saukville WI -38 -117 4 0
# 14 3028 Saukville WI -32 -90 8 0
# 15 3029 Saukville WI -39 -109 0 1
# 16 3030 Saukville WI -31 -115 8 0
# 17 3031 Mequon WI -38 -81 1 1
# 18 3033 Mequon WI -34 -104 3 0
I included two different ways to check for empty fields, since I'm uncertain whether your example was conveniently clean in that regard; you can easily use either nzchar (empty vs non-empty) or the grepl("[^\\s]", ...) check (some non-whitespace present). (Some data might also need an is.na check ...)
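For completeness, newer versions of dplyr (1.0.0 and later) also provide rows_patch(), which fills only the NA values of the left table with the matching values from the right table. A minimal sketch, assuming the cleanup above has already been run (df1 factors converted to character, df2 matrix columns flattened to plain vectors):
library(dplyr)
df1 %>%
  # turn the empty strings into NA so rows_patch() treats them as missing
  mutate(across(c(City, StateBasedIn), ~ na_if(.x, ""))) %>%
  # fill the NAs from the rows of df2 with the same SiteID
  rows_patch(df2, by = "SiteID")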

Related

Determine the total value of a frequency table

I have a dataset in which the number of pixels is counted for each value looking like this:
HISTO_2 HISTO_3 HISTO_4 HISTO_5 HISTO_6 HISTO_7 HISTO_10 HISTO_11 HISTO_14 HISTO_18 HISTO_19 HISTO_23
1 0 390 652 157 32 7 0 0 0 0 0 0
2 0 22 41 27 23 11 8 5 4 11 2 4
3 0 916 671 167 40 7 4 5 2 1 2 2
4 0 2600 810 172 38 0 0 0 0 0 0 0
5 0 110 987 791 248 59 11 5 0 1 0 0
6 0 778 808 182 43 5 0 0 0 0 0 0
7 0 1095 846 199 55 12 8 3 0 0 0 0
8 0 1045 545 60 0 0 0 0 0 0 0 0
9 0 868 422 92 2 0 0 0 0 0 0 0
10 0 1225 597 160 57 27 0 0 0 0 0 0
11 0 1092 1096 635 150 33 0 0 0 0 0 0
HISTO_2 captures the number of pixels with value 2, HISTO_3 captures the number of pixels with value 3, and so on. I need a way to efficiently compute the total value of all pixels per row. Precisely, each value in column HISTO_3 must be multiplied by 3, each value in column HISTO_4 must be multiplied by 4, and so on, before row-wise sums can be calculated. This has to be done for ten datasets. As can be seen in the table, the column values don't follow a regular sequence, and the sequence can differ between datasets.
Any efficient solutions to my problem?
PS: If you come up with a better title for my question, feel free to edit :)
Since you are interested in the TOTAL VALUE of the row pixels, you could do:
Base R:
colSums(t(df) * as.numeric(gsub('\\D', '',names(df))))
1 2 3 4 5 6 7 8 9 10 11
4804 1099 6781 12128 10317 6769 8191 5615 4764 7394 11966
or even:
as.matrix(df)%*%as.numeric(gsub('\\D', '',names(df)))
[,1]
1 4804
2 1099
3 6781
4 12128
5 10317
6 6769
7 8191
8 5615
9 4764
10 7394
11 11966
If you are unfamiliar with gsub and \\D (i.e. regex), then use
library(readr)
as.matrix(df) %*% parse_number(names(df))
[,1]
1 4804
2 1099
3 6781
4 12128
5 10317
6 6769
7 8191
8 5615
9 4764
10 7394
11 11966
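As a quick sanity check on the first row: 390*3 + 652*4 + 157*5 + 32*6 + 7*7 = 4804, which matches the first value in both outputs above.
Since the same computation has to run on ten data sets, here is a minimal sketch of how either one-liner can be applied across a list of them (df_list is a hypothetical name for that list; below it is stubbed with two copies of df):
library(readr)
# hypothetical container for the ten data frames; replace with your own
df_list <- list(ds1 = df, ds2 = df)
lapply(df_list, function(x) {
  weights <- parse_number(names(x))     # pixel value encoded in each column name
  as.vector(as.matrix(x) %*% weights)   # row-wise total pixel value
})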
With dplyr and tidyr (for unnest_wider)
EDIT: First extract the multipliers from the variable names, then unnest the list so the multiplication can be done column-wise with a pair of across() blocks.
library(dplyr)
library(tidyr)
df %>%
  mutate(mult = list(as.numeric(sub(".*_(\\d+)$", "\\1", colnames(.))))) %>%
  unnest_wider(mult, names_sep = "_") %>%
  summarize(across(starts_with("HIST")) * across(starts_with("mult"))) %>%
  rowwise() %>%
  mutate(total = sum(c_across(everything()))) %>%
  ungroup()
# A tibble: 11 × 13
HISTO_2 HISTO_3 HISTO_4 HISTO_5 HISTO_6 HISTO_7 HISTO_10 HISTO_11 HISTO_14
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 1170 2608 785 192 49 0 0 0
2 0 66 164 135 138 77 80 55 56
3 0 2748 2684 835 240 49 40 55 28
4 0 7800 3240 860 228 0 0 0 0
5 0 330 3948 3955 1488 413 110 55 0
6 0 2334 3232 910 258 35 0 0 0
7 0 3285 3384 995 330 84 80 33 0
8 0 3135 2180 300 0 0 0 0 0
9 0 2604 1688 460 12 0 0 0 0
10 0 3675 2388 800 342 189 0 0 0
11 0 3276 4384 3175 900 231 0 0 0
HISTO_18 HISTO_19 HISTO_23 total
<dbl> <dbl> <dbl> <dbl>
1 0 0 0 4804
2 198 38 92 1099
3 18 38 46 6781
4 0 0 0 12128
5 18 0 0 10317
6 0 0 0 6769
7 0 0 0 8191
8 0 0 0 5615
9 0 0 0 4764
10 0 0 0 7394
11 0 0 0 11966
For all 10 data sets
df_list <- list(df1, df2, df3, df4, df5, df6, df7, df8, df9, df10)
lapply(df_list, function(x)
  x %>%
    mutate(mult = list(as.numeric(sub(".*_(\\d+)$", "\\1", colnames(.))))) %>%
    unnest_wider(mult, names_sep = "_") %>%
    summarize(across(starts_with("HIST")) * across(starts_with("mult"))) %>%
    rowwise() %>%
    mutate(total = sum(c_across(everything()))) %>%
    ungroup())
Data
df <- structure(list(HISTO_2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), HISTO_3 = c(390L, 22L, 916L, 2600L, 110L, 778L, 1095L,
1045L, 868L, 1225L, 1092L), HISTO_4 = c(652L, 41L, 671L, 810L,
987L, 808L, 846L, 545L, 422L, 597L, 1096L), HISTO_5 = c(157L,
27L, 167L, 172L, 791L, 182L, 199L, 60L, 92L, 160L, 635L), HISTO_6 = c(32L,
23L, 40L, 38L, 248L, 43L, 55L, 0L, 2L, 57L, 150L), HISTO_7 = c(7L,
11L, 7L, 0L, 59L, 5L, 12L, 0L, 0L, 27L, 33L), HISTO_10 = c(0L,
8L, 4L, 0L, 11L, 0L, 8L, 0L, 0L, 0L, 0L), HISTO_11 = c(0L, 5L,
5L, 0L, 5L, 0L, 3L, 0L, 0L, 0L, 0L), HISTO_14 = c(0L, 4L, 2L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), HISTO_18 = c(0L, 11L, 1L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L), HISTO_19 = c(0L, 2L, 2L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), HISTO_23 = c(0L, 4L, 2L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11"))

Dividing non-equal dataframes based on a group condition

I have two dataframes with a similar structure:
df_I <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1220, 1230, 1310, 1320, 1330), `1` = c(1L,
8L, 2L, 2L, 0L, 2L, 0L, 1L, 0L), `2` = c(0L, 10L, 0L, 0L,
0L, 2L, 1L, 3L, 1L), `3` = c(4L, 2L, 1L, 2L, 0L, 4L,
0L, 0L, 3L), `4` = c(4L, 6L, 0L, 3L, 1L, 3L, 0L, 0L, 3L),
totaal = c(11, 26, 3, 7, 1, 9, 7, 7, 6)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 9 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1 0 4 4 11
2 2006 1110 8 10 2 6 26
3 2006 1120 2 0 1 0 3
4 2006 1130 2 0 2 3 7
5 2006 1220 0 0 0 1 1
6 2006 1230 2 2 4 3 9
7 2006 1310 0 1 0 0 7
8 2006 1320 1 3 0 0 7
9 2006 1330 0 1 3 3 6
df_II <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1210, 1220, 1230, 1310, 1320, 1330), `1` = c(15806L,
655L, 105L, 328L, 138L, 452L, 445L, 471L, 672L, 615L), `2` = c(9681L,
337L, 68L, 215L, 97L, 357L, 366L, 245L, 440L, 360L), `3` = c(10457L,
221L, 40L, 123L, 65L, 325L, 322L, 151L, 352L, 332L), `4` = c(7109L,
128L, 5L, 64L, 56L, 256L, 240L, 83L, 274L, 192L), totaal = c(43053,
1341, 218, 730, 356, 1390, 1373, 950, 1738, 1499)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 15806 9681 10457 7109 43053
2 2006 1110 655 337 221 128 1341
3 2006 1120 105 68 40 5 218
4 2006 1130 328 215 123 64 730
5 2006 1210 138 97 65 56 356
6 2006 1220 452 357 325 256 1390
7 2006 1230 445 366 322 240 1373
8 2006 1310 471 245 151 83 950
9 2006 1320 672 440 352 274 1738
10 2006 1330 615 360 332 192 1499
I would like to create a new data frame, df_out, which divides df_I by df_II for the columns `1`, `2`, `3`, `4` and totaal, matched by year and code. The issue is that not every code is available for each year.
What is the best way to divide this unequal dataframe?
Desired outcome:
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1/15806 0/9681 4/10457 4/7109 11/43053
You could subset the second data frame using %in%, assuming both code columns are properly ordered.
cols <- as.character(1:4)
cbind(df_I[setdiff(names(df_I), cols)], df_I[cols] / subset(df_II, code %in% df_I$code, cols))
# year code totaal 1 2 3 4
# 1 2006 0 11 6.326711e-05 0.000000000 0.0003825189 0.000562667
# 2 2006 1110 26 1.221374e-02 0.029673591 0.0090497738 0.046875000
# 3 2006 1120 3 1.904762e-02 0.000000000 0.0250000000 0.000000000
# 4 2006 1130 7 6.097561e-03 0.000000000 0.0162601626 0.046875000
# 5 2006 1220 1 0.000000e+00 0.000000000 0.0000000000 0.003906250
# 6 2006 1230 9 4.494382e-03 0.005464481 0.0124223602 0.012500000
# 7 2006 1310 7 0.000000e+00 0.004081633 0.0000000000 0.000000000
# 8 2006 1320 7 1.488095e-03 0.006818182 0.0000000000 0.000000000
# 9 2006 1330 6 0.000000e+00 0.002777778 0.0090361446 0.015625000
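As a spot check, the first entry of column 1 should be df_I's value 1 divided by df_II's value 15806:
1 / 15806
# [1] 6.326711e-05
which matches the first value shown above.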
You could use complete to make the number of rows between the two data frames equal, and then do the division:
library(tidyr)
library(magrittr) # for the %<>% assignment pipe
df_I %<>%
  complete(code = df_II$code) %>%
  fill(year) %>%
  replace(is.na(.), 0)
cbind(df_I[c(1, 2)], df_I[-c(1, 2)] / df_II[-c(1, 2)])
code year `1` `2` `3` `4` totaal
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 2006 0.0000633 0 0.000383 0.000563 0.000255
2 1110 2006 0.0122 0.0297 0.00905 0.0469 0.0194
3 1120 2006 0.0190 0 0.025 0 0.0138
4 1130 2006 0.00610 0 0.0163 0.0469 0.00959
5 1210 2006 0 0 0 0 0
6 1220 2006 0 0 0 0.00391 0.000719
7 1230 2006 0.00449 0.00546 0.0124 0.0125 0.00655
8 1310 2006 0 0.00408 0 0 0.00737
9 1320 2006 0.00149 0.00682 0 0 0.00403
10 1330 2006 0 0.00278 0.00904 0.0156 0.00400
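A join-based variant, which aligns the two frames by year and code rather than by row order, might look like the sketch below; codes that are present in df_II but absent from df_I simply drop out of the result:
library(dplyr)
library(tidyr)
df_out <- df_I %>%
  pivot_longer(-c(year, code), names_to = "col", values_to = "num") %>%
  inner_join(
    df_II %>% pivot_longer(-c(year, code), names_to = "col", values_to = "den"),
    by = c("year", "code", "col")
  ) %>%
  mutate(ratio = num / den) %>%
  select(year, code, col, ratio) %>%
  pivot_wider(names_from = col, values_from = ratio)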

How to sum variables by groups in R

I have a data frame named "SpatialKey" with three columns. The first column contains 5 categories representing population quintiles. The second column takes 4 values: 0, 400, 800 and 1200. The third column represents population.
For example
quintile isocrona total
4        1200     1674
1        400      1676
4        400      1723
5        800      1567
3        0        1531
3        1200     1370
2        1200     1925
1        400      1916
5        0        1776
2        800      1896
3        800      2143
5        400      2098
4        400      1496
1        0        961
4        800      1684
I want to classify the data by quintile and sum the population over the 4 values in the second column. For example:
quintile    0  400  800 1200
1         961 3592    0    0
2           0    0 1896 1925
3        1531    0 2143 1370
4           0 3219 1684 1674
5        1776 2098 1567    0
And here is my code.
po <- SpatialKey %>%
  group_by(quintile, isocrona) %>%
  summarise_at(vars(contains("total")), sum)
final_df <- as.data.frame(t(po))
But R gives me the following table:
          V1   V2  V3   V4 V5  V6   V7   V8   V9 V10  V11  V12 V13  V14  V15  V16  V17  V18  V19  V20
quintile   1    1   1    1  2   2    2    2    3   3    3    3   4    4    4    4    5    5    5    5
isocrona   0  400 800 1200  0 400  800 1200    0 400  800 1200   0  400  800 1200    0  400  800 1200
total    961 3592   0    0  0   0 1896 1925 1531   0 2143 1370   0 3219 1684 1674 1776 2098 1567    0
How can I produce the second table in R?
Use xtabs. Put the variable to be summed on the left-hand side of the formula and the others on the right-hand side. We can use a dot to mean all the rest. No packages are used.
xtabs(total ~., SpatialKey)
giving this xtabs table:
isocrona
quintile 0 400 800 1200
1 961 3592 0 0
2 0 0 1896 1925
3 1531 0 2143 1370
4 0 3219 1684 1674
5 1776 2098 1567 0
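If you'd rather end up with a plain data frame than an xtabs/table object, the contingency table can be converted back; a small follow-up sketch (the quintiles become row names and the isocrona values become column names):
tab <- xtabs(total ~ quintile + isocrona, SpatialKey)
as.data.frame.matrix(tab)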
Note
The input in reproducible form is:
SpatialKey <- structure(list(quintile = c(4L, 1L, 4L, 5L, 3L, 3L, 2L, 1L, 5L,
2L, 3L, 5L, 4L, 1L, 4L), isocrona = c(1200L, 400L, 400L, 800L,
0L, 1200L, 1200L, 400L, 0L, 800L, 800L, 400L, 400L, 0L, 800L),
total = c(1674L, 1676L, 1723L, 1567L, 1531L, 1370L, 1925L,
1916L, 1776L, 1896L, 2143L, 2098L, 1496L, 961L, 1684L)),
class = "data.frame", row.names = c(NA, -15L))
Here we need a pivot_wider to reshape into 'wide' format while doing the sum
library(dplyr)
library(tidyr)
SpatialKey %>%
  arrange(quintile, isocrona) %>%
  pivot_wider(names_from = isocrona, values_from = total,
              values_fn = sum, values_fill = 0)
-output
# A tibble: 5 x 5
# quintile `0` `400` `800` `1200`
# <int> <int> <int> <int> <int>
#1 1 961 3592 0 0
#2 2 0 0 1896 1925
#3 3 1531 0 2143 1370
#4 4 0 3219 1684 1674
#5 5 1776 2098 1567 0
Or use xtabs from base R
xtabs(total ~ quintile + isocrona, SpatialKey)
data
SpatialKey <- structure(list(quintile = c(4L, 1L, 4L, 5L, 3L, 3L, 2L, 1L, 5L,
2L, 3L, 5L, 4L, 1L, 4L), isocrona = c(1200L, 400L, 400L, 800L,
0L, 1200L, 1200L, 400L, 0L, 800L, 800L, 400L, 400L, 0L, 800L),
total = c(1674L, 1676L, 1723L, 1567L, 1531L, 1370L, 1925L,
1916L, 1776L, 1896L, 2143L, 2098L, 1496L, 961L, 1684L)),
class = "data.frame", row.names = c(NA,
-15L))
A data.table method based on grouping. The benefit is that the result is still in data frame format.
Result of long format:
library(data.table)
dt.long <- setDT(SpatialKey)[,sum(total),keyby = .(quintile,isocrona)]
dt.long
quintile isocrona V1
1: 1 0 961
2: 1 400 3592
3: 2 800 1896
4: 2 1200 1925
5: 3 0 1531
6: 3 800 2143
7: 3 1200 1370
8: 4 400 3219
9: 4 800 1684
10: 4 1200 1674
11: 5 0 1776
12: 5 400 2098
13: 5 800 1567
Result of wide format:
dcast(dt.long,quintile ~ isocrona,fill = 0,value.var = "V1")
quintile 0 400 800 1200
1: 1 961 3592 0 0
2: 2 0 0 1896 1925
3: 3 1531 0 2143 1370
4: 4 0 3219 1684 1674
5: 5 1776 2098 1567 0
Data:
SpatialKey <- structure(list(quintile = c(4L, 1L, 4L, 5L, 3L, 3L, 2L, 1L, 5L,
2L, 3L, 5L, 4L, 1L, 4L), isocrona = c(1200L, 400L, 400L, 800L,
0L, 1200L, 1200L, 400L, 0L, 800L, 800L, 400L, 400L, 0L, 800L),
total = c(1674L, 1676L, 1723L, 1567L, 1531L, 1370L, 1925L,
1916L, 1776L, 1896L, 2143L, 2098L, 1496L, 961L, 1684L)),
class = "data.frame", row.names = c(NA,
-15L))

How to combine dataframes in R based-on similar timelines for multiple attributes and then transforming the data to make these columns as row headers?

I am trying to merge my sales data and patients data (and some other attributes) in R, all of which are rolled up at the country level for the same time frame. After merging, I want to consolidate the result into a long format instead of a wide format and keep it unique at the Country-Month level.
This is what my input data looks like -
1) Sales Data
Coutry_ID Country_Name 1/28/2018 2/28/2018 3/28/2018 4/28/2018 5/28/2018
A0001 USA 44 72 85 25 72
A0002 Germany 98 70 69 48 41
A0003 Russia 82 42 32 29 43
A0004 UK 79 83 51 48 47
A0005 France 45 75 10 13 23
A0006 India 92 85 28 13 18
2) Patients Data
Coutry_ID Country_Name 1/28/2018 2/28/2018 3/28/2018 4/28/2018 5/28/2018
A0001 USA 7 13 22 23 13
A0002 Germany 9 10 17 25 25
A0003 Russia 24 19 6 8 5
A0004 UK 6 8 20 1 11
A0005 France 4 9 8 10 25
A0006 India 18 21 2 13 17
AND this is how I intend the output to look -
Coutry_ID Country_Name Month Sales Patients
A0001 USA 1/28/2018 44 7
A0001 USA 2/28/2018 72 13
A0001 USA 3/28/2018 85 22
A0001 USA 4/28/2018 25 23
A0001 USA 5/28/2018 72 13
A0002 Germany 1/28/2018 98 9
A0002 Germany 2/28/2018 70 10
A0002 Germany 3/28/2018 69 17
A0002 Germany 4/28/2018 48 25
A0002 Germany 5/28/2018 41 25
A0003 Russia 1/28/2018 82 24
A0003 Russia 2/28/2018 42 19
A0003 Russia 3/28/2018 32 6
A0003 Russia 4/28/2018 29 8
A0003 Russia 5/28/2018 43 5
A0004 UK 1/28/2018 79 6
A0004 UK 2/28/2018 83 8
A0004 UK 3/28/2018 51 20
A0004 UK 4/28/2018 48 1
A0004 UK 5/28/2018 47 11
A0005 France 1/28/2018 45 4
A0005 France 2/28/2018 75 9
A0005 France 3/28/2018 10 8
A0005 France 4/28/2018 13 10
A0005 France 5/28/2018 23 25
A0006 India 1/28/2018 92 18
A0006 India 2/28/2018 85 21
A0006 India 3/28/2018 28 2
A0006 India 4/28/2018 13 13
A0006 India 5/28/2018 18 17
I need a little guidance on these 2 things -
1 - How to convert the data from wide to long?
2 - For merging the data, I am thinking about using dplyr's left_join on all these data sets with my master list of countries (ID and Name). My doubt is whether I should first convert the data sets from wide to long format, or do that after merging?
You can get both the dataframes in long format and then join:
library(dplyr)
library(tidyr)
inner_join(
  sales %>% pivot_longer(cols = -c(Coutry_ID, Country_Name), values_to = 'Sales'),
  patients %>% pivot_longer(cols = -c(Coutry_ID, Country_Name),
                            values_to = 'Patients'),
  by = c("Coutry_ID", "Country_Name", "name"))
# A tibble: 30 x 5
# Coutry_ID Country_Name name Sales Patients
# <fct> <fct> <chr> <int> <int>
# 1 A0001 USA 1/28/2018 44 7
# 2 A0001 USA 2/28/2018 72 13
# 3 A0001 USA 3/28/2018 85 22
# 4 A0001 USA 4/28/2018 25 23
# 5 A0001 USA 5/28/2018 72 13
# 6 A0002 Germany 1/28/2018 98 9
# 7 A0002 Germany 2/28/2018 70 10
# 8 A0002 Germany 3/28/2018 69 17
# 9 A0002 Germany 4/28/2018 48 25
#10 A0002 Germany 5/28/2018 41 25
# … with 20 more rows
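To match the requested column names exactly, the auto-generated name column can be renamed to Month and, if useful, parsed as a date; a small follow-up sketch, where joined stands for the result of the inner_join above:
library(dplyr)
joined %>%                 # joined: result of the inner_join shown above
  rename(Month = name) %>%
  mutate(Month = as.Date(Month, format = "%m/%d/%Y"))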
data
sales <- structure(list(Coutry_ID = structure(1:6, .Label = c("A0001",
"A0002", "A0003", "A0004", "A0005", "A0006"), class = "factor"),
Country_Name = structure(c(6L, 2L, 4L, 5L, 1L, 3L), .Label = c("France",
"Germany", "India", "Russia", "UK", "USA"), class = "factor"),
`1/28/2018` = c(44L, 98L, 82L, 79L, 45L, 92L), `2/28/2018` = c(72L,
70L, 42L, 83L, 75L, 85L), `3/28/2018` = c(85L, 69L, 32L,
51L, 10L, 28L), `4/28/2018` = c(25L, 48L, 29L, 48L, 13L,
13L), `5/28/2018` = c(72L, 41L, 43L, 47L, 23L, 18L)), class =
"data.frame", row.names = c(NA, -6L))
patients <- structure(list(Coutry_ID = structure(1:6, .Label = c("A0001",
"A0002", "A0003", "A0004", "A0005", "A0006"), class = "factor"),
Country_Name = structure(c(6L, 2L, 4L, 5L, 1L, 3L), .Label = c("France",
"Germany", "India", "Russia", "UK", "USA"), class = "factor"),
`1/28/2018` = c(7L, 9L, 24L, 6L, 4L, 18L), `2/28/2018` = c(13L,
10L, 19L, 8L, 9L, 21L), `3/28/2018` = c(22L, 17L, 6L, 20L,
8L, 2L), `4/28/2018` = c(23L, 25L, 8L, 1L, 10L, 13L), `5/28/2018` = c(13L,
25L, 5L, 11L, 25L, 17L)), class = "data.frame", row.names = c(NA, -6L))
Base R (not as eloquent as above):
# Create a named list of dataframes:
df_list <- list(patients = patients, sales = sales)
# Create a vector in each with the name of the dataframe:
df_list <- mapply(cbind, df_list, "desc" = as.character(names(df_list)),
SIMPLIFY = FALSE)
# Define a function to reshape the data:
reshape_ps <- function(x){
  tmp <- setNames(reshape(x,
                          direction = "long",
                          varying = which(names(x) %in% names(x[, sapply(x, is.numeric)])),
                          idvar = c(!(names(x) %in% names(x[, sapply(x, is.numeric)]))),
                          v.names = "month",
                          times = as.Date(names(x[, sapply(x, is.numeric)]), "%m/%d/%Y"),
                          new.row.names = 1:(nrow(x) * length(which(names(x) %in% names(x[, sapply(x, is.numeric)]))))),
                  c(names(x[!(names(x) %in% names(x[, sapply(x, is.numeric)]))]), "month", as.character(unique(x$desc))))
  # Drop the dataframe name vector:
  clean <- tmp[, names(tmp) != "desc"]
  # Specify the return object:
  return(clean)
}
# Merge the result of the function applied on both dataframes:
Reduce(function(y, z){merge(y, z, by = intersect(colnames(y), colnames(z)), all = TRUE)},
Map(function(x){reshape_ps(x)}, df_list))

R programming - data frame manoeuvre

Suppose I have the following dataframe:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 50 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
5: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
6: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 48
4: 2 3 TRUE 1 2010 0 50
5: 2 3 TRUE 1 2010 0 52
6: 3 3 FALSE 1 2010 0 57
I'd like to turn it into a new dataframe like the following:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 (sum of nF for 48 and 50, factdcx) 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
How can I do it? (Of course, the actual dataframe, abc, is much larger, but I want to sum nF over the categories 48 and 50 and group them into a new category, say '48'.)
Many thanks!
> dput(head(abc1))
structure(list(dc = c(24L, 41L, 48L, 50L, 52L, 57L), tmin = c(-1L,
-3L, 0L, 0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57", "70"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), class = c("data.table", "data.frame"
), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x054b24a0>)
Still got a problem, sir/madam:
> head(abc1)   # updated
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
6: 70 -2 3 5 -4 -1 20 25 30 35 360 3.6 10.2 6.6 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 57 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
6: 3 2 TRUE 1 2010 1 70
The sum of nF was incorrect, it should be zero.
Try
library(data.table)
unique(setDT(df1)[, factdcx:= as.character(factdcx)][factdcx %chin%
c('48','50'), c('dc', 'factdcx', 'nF') := list('48', '48', sum(nF))])
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
For abc1,
res1 <- unique(setDT(abc1)[, factdcx:= as.character(factdcx)][factdcx %chin%
c('48','50'), c('dc', 'factdcx', 'nF') := list(48, '48', sum(nF))])
res1
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
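An alternative sketch (not part of the original answer) that collapses the two categories first and then aggregates, so the nF sum and the de-duplication happen in one grouped step; it uses the df1 defined in the data block below:
library(data.table)
dt <- as.data.table(df1)
dt[, factdcx := as.character(factdcx)]
# fold category "50" into "48", in both the key and the display column
dt[factdcx == "50", c("factdcx", "dc") := .("48", "48")]
# keep the first row of every (possibly collapsed) group, but sum nF across it
res2 <- dt[, c(lapply(.SD, first), .(nF = sum(nF))),
           by = factdcx,
           .SDcols = setdiff(names(dt), c("factdcx", "nF"))]
res2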
data
df1 <- structure(list(dc = structure(1:6, .Label = c("24", "41",
"48",
"50", "52", "57"), class = "factor"), tmin = c(-1L, -3L, 0L,
0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), row.names = c("1:", "2:", "3:",
"4:", "5:", "6:"), class = "data.frame")
