How to melt multiple columns of a df with R? - r
I have this data that I would like to transform to long format.
library(tidyverse)
library(data.table)
#>
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#>
#> between, first, last
#> The following object is masked from 'package:purrr':
#>
#> transpose
# Reproducible data (dput output): 28 rows x 14 columns in a "blocked" wide
# layout. Each measurement series (sea-ice algae, water at 1.5/10/40/60 m)
# carries its own julian_days* day column, a value column, and an sd* column,
# all NA-padded to a common length of 28.
df <- structure(list(
# series 1 (sea-ice algae): julian_days / sea_ice_algae_last_cm / sd
julian_days = c(
127, 130, 132, 134, 137, 139,
141, 144, 148, 151, 153, 155, 158, 160, 162, 165, 167, 169, 172,
NA, NA, NA, NA, NA, NA, NA, NA, NA
), sea_ice_algae_last_cm = c(
0.636,
0.698, 0.666666666666667, 0.685384615384615, 0.713, 0.6375, 0.58375,
0.637272727272727, 0.6575, 0.691666666666667, 0.629166666666667,
0.637142857142857, 0.589166666666667, 0.56, 0.571818181818182,
0.492, 0.31, 0.312, 0.203076923076923, NA, NA, NA, NA, NA, NA,
NA, NA, NA
), sd = c(
0.0227058484879019, 0.0369684550213647, 0.0533853912601565,
0.0525381424324881, 0.0413790070231539, 0.0381682876458741, 0.0277788888666675,
0.0410099766132362, 0.0222076972732838, 0.0194079021706795, 0.0299873710792131,
0.0363841933236059, 0.0253908835942542, 0.055746679790749, 0.0604678727620178,
0.0294957624075053, 0.10770329614269, 0.0657267069006199, 0.0693282789084673,
NA, NA, NA, NA, NA, NA, NA, NA, NA
# series 2 and 3 (water at 1.5 m and 10 m) appear to share julian_days_2:
# julian_days_2 / water_1_5_m_depth / sd_2 / water_10_m_depth / sd_3
), julian_days_2 = c(
127, 130,
132, 134, 137, 139, 141, 144, 146, 148, 151, 153, 155, 158, 160,
162, 165, 167, 169, 172, 174, 176, 179, 181, 183, 186, 188, 190
), water_1_5_m_depth = c(
0.69, 0.5475, 0.596, 0.512, 0.598, 0.488333333333333,
0.27, 0.41, 0.568, 0.503333333333333, 0.668333333333333, 0.71,
0.636666666666667, 0.623333333333333, 0.66, 0.541666666666667,
0.57, 0.545, 0.501666666666667, 0.526666666666667, 0.566666666666667,
0.493333333333333, 0.59, 0.518333333333333, 0.443333333333333,
0.605, 0.58, 0.478333333333333
), sd_2 = c(
0.121655250605964,
0.0718215380880506, 0.0736885337077625, 0.0376828873628335, 0.084380092438916,
0.0636919670497516, 0.054037024344425, 0.0540370243444251, 0.0370135110466435,
0.0571547606649408, 0.0702614166286638, 0.0442718872423573, 0.0799166232186176,
0.0480277697448743, 0.0409878030638384, 0.0462240918425302, 0.0920869154657709,
0.0706399320497981, 0.0511533641774093, 0.100531918646103, 0.0186189867250252,
0.0588784057755188, 0.0841427358718512, 0.0934701378337842, 0.0492612085384298,
0.0653452370108182, 0.0878635305459549, 0.0851860708488579
),
water_10_m_depth = c(
0.66, 0.732, 0.595, 0.712, 0.514, 0.48,
0.35, 0.44, 0.535, 0.403333333333333, 0.728, 0.746, 0.625,
0.698333333333333, 0.705, 0.555, 0.585, 0.651666666666667,
0.603333333333333, 0.595, 0.615, 0.615, 0.658333333333333,
0.641666666666667, 0.623333333333333, 0.628333333333333,
0.661666666666667, 0.631666666666667
), sd_3 = c(
0, 0.0342052627529742,
0.0387298334620742, 0.0327108544675923, 0.0610737259384104,
0.0700000000000001, 0.127279220613579, 0.0972111104761177,
0.0564800849857717, 0.0504645089807343, 0.0540370243444252,
0.0415932686861709, 0.0809320702811933, 0.0475043857624395,
0.0398748040747538, 0.0568330889535313, 0.0388587184554509,
0.0204124145231932, 0.058878405775519, 0.0896102672688791,
0.0535723809439155, 0.0488876262463212, 0.043089055068157,
0.0306050104830347, 0.0527888877195444, 0.0708284312029193,
0.0426223728418147, 0.0348807492274272
# series 4 (water at 40 m): julian_days_3 / water_40_m_depth / sd_4
), julian_days_3 = c(
134,
137, 139, 141, 146, 148, 153, 155, 160, 162, 165, 169, 172,
174, 176, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA
), water_40_m_depth = c(
0.523166666666667, 0.360833333333333,
0.279, 0.228, 0.551166666666667, 0.358666666666667, 0.593,
0.6225, 0.6665, 0.5468, 0.334714285714286, 0.654, 0.567666666666667,
0.664166666666667, 0.6345, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
), sd_4 = c(
0.0793937445058905, 0.0346145441493408,
0.0834625664594612, 0.105740247777277, 0.0437008771841786,
0.0810719844747042, 0.0849529281425892, 0.0539620236833275,
0.0689514321823702, 0.0344992753547085, 0.0889713704621029,
0.064221491729794, 0.0166933120340652, 0.0545982295195244,
0.0578472125516865, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA
# series 5 (water at 60 m, only five observations):
# julian_days_4 / water_60_m_depth / sd_5
), julian_days_4 = c(
181, 183, 186, 188, 190, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA
), water_60_m_depth = c(
0.617833333333333,
0.492333333333333, 0.642166666666667, 0.7265, 0.686166666666667,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA
), sd_5 = c(
0.0574818812032684,
0.049766119666563, 0.0704540039079871, 0.0286618212959331,
0.0382225936674458, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)
), row.names = c(
NA,
-28L
), class = c("tbl_df", "tbl", "data.frame"))
# Sort descending on the fourth day column so the day-190 rows surface first
df %>%
  arrange(desc(julian_days_4))
#> # A tibble: 28 x 14
#> julian_days sea_ice_algae_l… sd julian_days_2 water_1_5_m_dep…
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 137 0.713 0.0414 137 0.598
#> 2 134 0.685 0.0525 134 0.512
#> 3 132 0.667 0.0534 132 0.596
#> 4 130 0.698 0.0370 130 0.548
#> 5 127 0.636 0.0227 127 0.69
#> 6 139 0.638 0.0382 139 0.488
#> 7 141 0.584 0.0278 141 0.27
#> 8 144 0.637 0.0410 144 0.41
#> 9 148 0.658 0.0222 146 0.568
#> 10 151 0.692 0.0194 148 0.503
#> # … with 18 more rows, and 9 more variables: sd_2 <dbl>,
#> # water_10_m_depth <dbl>, sd_3 <dbl>, julian_days_3 <dbl>,
#> # water_40_m_depth <dbl>, sd_4 <dbl>, julian_days_4 <dbl>,
#> # water_60_m_depth <dbl>, sd_5 <dbl>
I would like to “stack” all this into 3 columns:
julian with all columns starting with “julian”
measure with all columns starting with “water” or “sea”
sd with all columns starting with “sd”
Note that in the “water” columns, the numbers represent the depth (ex.: water_1_5_m_depth means 1.5 m).
The desired output for the first line would be something like:
# Desired output for the first observation of each series, written row-wise
# so each (julian, type, measure, sd) record reads across one line
tribble(
  ~julian, ~type,       ~measure, ~sd,
  127,     "sea",       0.64,     0.02,
  127,     "water_1.5", 0.69,     0.12,
  127,     "water_10",  0.66,     0,
  134,     "water_40",  0.52,     0.08,
  181,     "water_60",  0.62,     0.06
)
#> # A tibble: 5 x 4
#> julian type measure sd
#> <dbl> <chr> <dbl> <dbl>
#> 1 127 sea 0.64 0.02
#> 2 127 water_1.5 0.69 0.12
#> 3 127 water_10 0.66 0
#> 4 134 water_40 0.52 0.08
#> 5 181 water_60 0.62 0.06
My attempt so far was with data.table.
# Attempt: a single melt() with one pattern group per desired output column.
# NOTE(review): the pattern groups match unequal numbers of columns
# (4 "julian", 1 "sea", 1 of each "water", 5 "sd"); the output below shows
# melt() produced 5 variable levels (28 x 5 = 140 rows) with shorter groups
# padded by NA — which is why the result is misaligned.
melt(
setDT(df),
# spell out measure.vars in full: relying on partial argument matching
# ("measure =") is fragile and breaks if melt() gains a new argument
# starting with "measure"
measure.vars = patterns("^julian", "^sea", "^water_1_5", "^water_10", "^water_40", "^water_60", "^sd"),
value.name = c("julian", "sea", "water_1.5", "water_10", "water_40", "water_60", "sd")
)
#> variable julian sea water_1.5 water_10 water_40 water_60
#> 1: 1 127 0.6360000 0.6900 0.660 0.5231667 0.6178333
#> 2: 1 130 0.6980000 0.5475 0.732 0.3608333 0.4923333
#> 3: 1 132 0.6666667 0.5960 0.595 0.2790000 0.6421667
#> 4: 1 134 0.6853846 0.5120 0.712 0.2280000 0.7265000
#> 5: 1 137 0.7130000 0.5980 0.514 0.5511667 0.6861667
#> ---
#> 136: 5 NA NA NA NA NA NA
#> 137: 5 NA NA NA NA NA NA
#> 138: 5 NA NA NA NA NA NA
#> 139: 5 NA NA NA NA NA NA
#> 140: 5 NA NA NA NA NA NA
#> sd
#> 1: 0.02270585
#> 2: 0.03696846
#> 3: 0.05338539
#> 4: 0.05253814
#> 5: 0.04137901
#> ---
#> 136: NA
#> 137: NA
#> 138: NA
#> 139: NA
#> 140: NA
Any help appreciated.
UPDATE:
Here is the file I received.
Created on 2019-04-12 by the reprex package (v0.2.1)
library(tidyverse)

# Carve the 14 wide columns into one chunk per measurement series by
# position: cols 1-3 (sea), 4-8 (the 1.5 m and 10 m series, which share a
# day column), 9-11 (40 m), 12-14 (60 m).
list_of_dfs <- split.default(df, rep(1:4, c(3, 5, 3, 3)))
# The second chunk carries two series; peel the 10 m columns (day, value,
# sd = positions 1, 4, 5) off into a fifth chunk and keep only the first
# three (1.5 m) columns in the second.
list_of_dfs[[5]] <- list_of_dfs[[2]][, c(1, 4, 5)]
list_of_dfs[[2]] <- list_of_dfs[[2]][, 1:3]

# Per chunk: drop the NA-padding rows, record which series it is (the one
# column name starting with "sea"/"water"), normalise the column names,
# then stack everything into one long tibble.
list_of_dfs %>%
  map(function(chunk) chunk[complete.cases(chunk), ]) %>%
  map(function(chunk) {
    mutate(chunk, type = grep("^sea|^water", names(chunk), value = TRUE))
  }) %>%
  map(setNames, nm = c("julian", "measure", "sd", "type")) %>%
  bind_rows()
# # A tibble: 95 x 4
# julian measure sd type
# <dbl> <dbl> <dbl> <chr>
# 1 127 0.636 0.0227 sea_ice_algae_last_cm
# 2 130 0.698 0.0370 sea_ice_algae_last_cm
# 3 132 0.667 0.0534 sea_ice_algae_last_cm
# 4 134 0.685 0.0525 sea_ice_algae_last_cm
# 5 137 0.713 0.0414 sea_ice_algae_last_cm
# 6 139 0.638 0.0382 sea_ice_algae_last_cm
# 7 141 0.584 0.0278 sea_ice_algae_last_cm
# 8 144 0.637 0.0410 sea_ice_algae_last_cm
# 9 148 0.658 0.0222 sea_ice_algae_last_cm
# 10 151 0.692 0.0194 sea_ice_algae_last_cm
# # … with 85 more rows
It would be nice if you share your desired output. I think this is what you want:
# Gather each column family separately, then glue the stacks side by side.
# CAUTION: bind_cols() aligns purely by position, so this assumes gather()
# stacks the "julian", "water", and "sd" families in matching column order
# and row order — any mismatch silently mis-pairs values.
# NOTE(review): starts_with("sd") matches five columns (sd, sd_2..sd_5)
# while starts_with("julian") and starts_with("water") each match four, so
# the stacked heights differ (140 vs 112 rows); confirm bind_cols() really
# accepts this without mis-pairing.
df %>%
select(starts_with("julian")) %>%
gather(key = col, julian) %>%
bind_cols(df %>%
select(starts_with("water")) %>%
gather(col_water, measure)) %>%
# the "sea" family is deliberately skipped (it would be one-to-many
# against the water stacks); see the commented-out block below
#bind_cols(df %>%
# select(starts_with("sea")) %>%
# gather(col_sea, measure2)) %>%
bind_cols(df %>%
select(starts_with("sd")) %>%
gather(col_sd, sd)) %>%
select(julian, measure, sd)
julian measure sd
<dbl> <dbl> <dbl>
1 127 0.69 0.122
2 130 0.548 0.0718
3 132 0.596 0.0737
4 134 0.512 0.0377
5 137 0.598 0.0844
6 139 0.488 0.0637
7 141 0.27 0.0540
8 144 0.41 0.0540
9 148 0.568 0.0370
10 151 0.503 0.0572
# ... with 102 more rows
In this try I did not include the variables starting with sea, since it would lead to a one-to-many merge. Let me know if I am on the right track to include that one.
# Chained melts: each data.table::melt() call collapses one column family
# into a (variable, value) pair. Because every successive melt re-expands
# the rows produced by the previous one, the final table is the Cartesian
# product of the families: 28 rows x 4 julian x 5 measure x 5 sd = 2800
# rows (see the duplication discussion below).
data.table::melt(
df,
measure.vars = patterns("^julian"),
variable.name = "julian_variable",
value.name = "julian_value"
) %>%
# second pass: stack the sea/water measurement columns
data.table::melt(
measure.vars = patterns(measure = "^sea|^water"),
variable.name = "measure_variable",
value.name = "measure_value"
) %>%
# third pass: stack the sd columns
data.table::melt(
measure.vars = patterns(measure = "^sd"),
variable.name = "sd_variable",
value.name = "sd_value"
)
# julian_variable julian_value measure_variable measure_value sd_variable sd_value
# 1: julian_days 127 sea_ice_algae_last_cm 0.6360000 sd 0.02270585
# 2: julian_days 130 sea_ice_algae_last_cm 0.6980000 sd 0.03696846
# 3: julian_days 132 sea_ice_algae_last_cm 0.6666667 sd 0.05338539
# 4: julian_days 134 sea_ice_algae_last_cm 0.6853846 sd 0.05253814
# 5: julian_days 137 sea_ice_algae_last_cm 0.7130000 sd 0.04137901
# ---
# 2796: julian_days_4 NA water_60_m_depth NA sd_5 NA
# 2797: julian_days_4 NA water_60_m_depth NA sd_5 NA
# 2798: julian_days_4 NA water_60_m_depth NA sd_5 NA
# 2799: julian_days_4 NA water_60_m_depth NA sd_5 NA
# 2800: julian_days_4 NA water_60_m_depth NA sd_5 NA
Though it is unclear as to what the desired output is. This solution obviously leads to a lot of duplication (basically, each individual value is duplicated 100 times! 4 "julian" columns * 5 "measure" columns * 5 "sd" columns).
Related
Specify which column(s) a specific date appears in R
I have a subset of my data in a dataframe (dput codeblock below) containing dates in which a storm occurred ("Date_AR"). I'd like to know if a storm occurred in the north, south or both, by determining whether the same date occurred in the "Date_N" and/or "Date_S" column/s. For example, the first date is Jan 17, 1989 in the "Date_AR" column. In the location column, I would like "S" to be printed, since this date is found in the "Date_S" column. If Apr 5. 1989 occurs in "Date_N" and "Date_S", the I would like a "B" (for both) to be printed in the location column. Thanks in advance for the help! Apologies if this type of question is already out there. I may not know the keywords to search. structure(list(Date_S = structure(c(6956, 6957, 6970, 7008, 7034, 7035, 7036, 7172, 7223, 7224, 7233, 7247, 7253, 7254, 7255, 7262, 7263, 7266, 7275, 7276), class = "Date"), Date_N = structure(c(6968, 6969, 7035, 7049, 7103, 7172, 7221, 7223, 7230, 7246, 7247, 7251, 7252, 7253, 7262, 7266, 7275, 7276, 7277, 7280), class = "Date"), Date_AR = structure(c(6956, 6957, 6968, 6969, 6970, 7008, 7034, 7035, 7036, 7049, 7103, 7172, 7221, 7223, 7224, 7230, 7233, 7246, 7247, 7251), class = "Date"), Precip = c(23.6, 15.4, 3, 16.8, 0.2, 3.6, 22, 13.4, 0, 30.8, 4.6, 27.1, 0, 19, 2.8, 11.4, 2, 57.6, 9.4, 39), Location = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 20L), class = "data.frame")
Using dplyr::case_when you could do: library(dplyr) dat |> mutate(Location = case_when( Date_AR %in% Date_S & Date_AR %in% Date_N ~ "B", Date_AR %in% Date_S ~ "S", Date_AR %in% Date_N ~ "N" )) #> Date_S Date_N Date_AR Precip Location #> 1 1989-01-17 1989-01-29 1989-01-17 23.6 S #> 2 1989-01-18 1989-01-30 1989-01-18 15.4 S #> 3 1989-01-31 1989-04-06 1989-01-29 3.0 N #> 4 1989-03-10 1989-04-20 1989-01-30 16.8 N #> 5 1989-04-05 1989-06-13 1989-01-31 0.2 S #> 6 1989-04-06 1989-08-21 1989-03-10 3.6 S #> 7 1989-04-07 1989-10-09 1989-04-05 22.0 S #> 8 1989-08-21 1989-10-11 1989-04-06 13.4 B #> 9 1989-10-11 1989-10-18 1989-04-07 0.0 S #> 10 1989-10-12 1989-11-03 1989-04-20 30.8 N #> 11 1989-10-21 1989-11-04 1989-06-13 4.6 N #> 12 1989-11-04 1989-11-08 1989-08-21 27.1 B #> 13 1989-11-10 1989-11-09 1989-10-09 0.0 N #> 14 1989-11-11 1989-11-10 1989-10-11 19.0 B #> 15 1989-11-12 1989-11-19 1989-10-12 2.8 S #> 16 1989-11-19 1989-11-23 1989-10-18 11.4 N #> 17 1989-11-20 1989-12-02 1989-10-21 2.0 S #> 18 1989-11-23 1989-12-03 1989-11-03 57.6 N #> 19 1989-12-02 1989-12-04 1989-11-04 9.4 B #> 20 1989-12-03 1989-12-07 1989-11-08 39.0 N
How to use ggplot_na_imputations() or ggplot_na_distribution() from the package imputeTS
I have a dataframe (table with 100 rows/countries and 28 columns/months between 2020 and 2022). I used the package imputeTS and used the function na_kalman() to substitute my several NAs values by some estimated values. Everything goes fine till here. After, when I try to plot using gplot_na_imputations() or ggplot_na_distribution() an error is shown: "Input x_with_na is not numeric". I think the solution is to convert my dataframe into a time series 'ts'. Any suggestions? This is what I have: total_tests_imp <- na_kalman(total_tests_md) ggplot_na_imputations(x_with_na = total_tests_md, x_with_imputations = total_tests_imp) ggplot_na_distribution(total_tests_md) (ps.) when I run: class(total_tests_md) it appears:[1] "tbl_df" "tbl" "data.frame" When I run `head(total_tests_md)´ # A tibble: 6 x 29 countries jan_20 fev_20 mar_20 abr_20 mai_20 jun_20 jul_20 ago_20 set_20 out_20 nov_20 dez_20 jan_21 fev_21 mar_21 abr_21 <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 1 Afghanistan NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 2 Albania NA 0.009 0.54 2.83 5.08 8.19 12.9 20.3 29.1 42.0 61.7 86.2 119. 155. 187. 214. 3 Algeria NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 4 Andorra NA NA NA NA NA NA NA NA 691. 1033. 1405. 1613. 1819. 2003. 2175. 2335. 5 Angola NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 6 Argentina 0.013 0.015 0.162 1.55 4.44 9.91 19.7 34.3 52.3 74.3 92.3 112. 143. 172. 204. 257. # ... 
with 12 more variables: mai_21 <dbl>, jun_21 <dbl>, jul_21 <dbl>, ago_21 <dbl>, set_21 <dbl>, out_21 <dbl>, # nov_21 <dbl>, dez_21 <dbl>, jan_22 <dbl>, fev_22 <dbl>, mar_22 <dbl>, abr_22 <dbl>´´´ dput(head(total_tests_md)) structure(list(countries = c("Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Argentina"), jan_20 = c(NA, NA, NA, NA, NA, 0.013), fev_20 = c(NA, 0.009, NA, NA, NA, 0.015), mar_20 = c(NA, 0.54, NA, NA, NA, 0.162), abr_20 = c(NA, 2.831, NA, NA, NA, 1.546 ), mai_20 = c(NA, 5.083, NA, NA, NA, 4.445), jun_20 = c(NA, 8.192, NA, NA, NA, 9.913), jul_20 = c(NA, 12.852, NA, NA, NA, 19.719 ), ago_20 = c(NA, 20.317, NA, NA, NA, 34.32), set_20 = c(NA, 29.089, NA, 691.095, NA, 52.255), out_20 = c(NA, 42.031, NA, 1033.495, NA, 74.307), nov_20 = c(NA, 61.658, NA, 1404.711, NA, 92.271), dez_20 = c(NA, 86.158, NA, 1613.414, NA, 112.404), jan_21 = c(NA, 119.428, NA, 1819.053, NA, 143.415), fev_21 = c(NA, 154.702, NA, 2003.284, NA, 171.576), mar_21 = c(NA, 186.772, NA, 2174.988, NA, 203.784), abr_21 = c(NA, 214.329, NA, 2335.148, NA, 257.398 ), mai_21 = c(NA, 243.676, NA, 2480.234, NA, 317.92), jun_21 = c(NA, 271.086, NA, 2543.915, NA, 375.2), jul_21 = c(NA, 299.727, NA, 2621.83, NA, 433.25), ago_21 = c(NA, 352.728, NA, 2709.918, NA, 492.053), set_21 = c(NA, 404.621, NA, 2767.717, NA, 528.764), out_21 = c(NA, 439.925, NA, 2850.247, NA, 556.29), nov_21 = c(NA, 467.614, NA, 3006.839, NA, 580.944), dez_21 = c(NA, 495.44, NA, 3449.208, NA, 627.339), jan_22 = c(21.413, 543.967, NA, 3840.758, 40.321, 730.777), fev_22 = c(22.328, 552.997, NA, 3882.243, 41.965, 756.948), mar_22 = c(22.695, 556.666, 5.167, NA, 43.944, 777.078), abr_22 = c(NA, 558.412, NA, NA, 44.198, 783.816)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
When you use ggplot_na_imputations or ggplot_na_distribution, you should provide vector or ts object in one dimension as it is specified in the function description : https://www.rdocumentation.org/packages/imputeTS/versions/3.2/topics/ggplot_na_imputations So you must convert your data.frame with all countries into a vector by country. Moreover, to convert a vector to time series, see there : https://stat.ethz.ch/R-manual/R-devel/library/stats/html/ts.html Your data total_tests_md <- structure(list(countries = c("Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Argentina"), jan_20 = c(NA, NA, NA, NA, NA, 0.013), fev_20 = c(NA, 0.009, NA, NA, NA, 0.015), mar_20 = c(NA, 0.54, NA, NA, NA, 0.162), abr_20 = c(NA, 2.831, NA, 0.3, NA, 1.546)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) Import your libraries library(zoo) library(imputeTS) Convert your data.frame into a vector # remove country name Albania <- total_tests_md[2,-1] Albania <- as.numeric(Albania) # create month vector month <- seq(as.Date("2020-01-01"), as.Date("2020-04-01"), by = "month") When you use time series # reasonning with ts Albaniats <- zoo(Albania, month) AlbaniatsInput <- Albaniats AlbaniatsInput[1] <- 0.5 ggplot_na_imputations(x_with_na = Albaniats, x_with_imputations = AlbaniatsInput, x_axis_labels = index(Albaniats)) ggplot_na_distribution(Albaniats, x_axis_labels = index(Albaniats)) When you use only vector #reasoning with numeric vector AlbaniaInput <- Albania AlbaniaInput[1] <- 0.5 ggplot_na_imputations(x_with_na = Albania, x_with_imputations = AlbaniaInput, x_axis_labels = month) ggplot_na_distribution(Albania, x_axis_labels = month)
Replacing elements in a column of a dataframe by using regular expressions in R
df is a test dataframe and is a subset of my original dataframe which has ~1000000 rows and 21 columns. df <- data.frame( Hits = c("# test1", "# Query: tr|A4I9M8|A4I9M8_LEIIN", "# 13135", "tr|E9BQL4|E9BQL4_LEIDB", "tr|A4I9M8|A4I9M8_LEIIN", "tr|A0A3Q8IUE6|A0A3Q8IUE6_LEIDO", "tr|Q4Q3E9|Q4Q3E9_LEIMA", "tr|A0A640KX53|A0A640KX53_LEITA", "# test2", "# Query: tr|E9AH01|E9AH01_LEIIN", "# 83771", "tr|A0A6L0XNG2|A0A6L0XNG2_LEIIN", "tr|E9AH01|E9AH01_LEIIN", "tr|A0A6J8FCW4|A0A6J8FCW4_LEIDO", "tr|A0A6J8FCW4|A0A6J8FCW4_LEIDO"), Category1 = c(NA, NA, NA, 0.001, 0.001, 0.002, 0.003, 0.003, NA, NA, NA, 0.023, 0.341, 0.341, 0.569), Category2 = c(NA, NA, NA, 100, 100, 99, 98, 98, NA, NA, NA, 100, 95, 95, 97), Category3 = c(NA, NA, NA, 100, 100, 99, 98, 98, NA, NA, NA, 98, 97, 97, 92)) df looks something like this In the Hits column, the elements which don't start with a # are to be replaced by the portion lying between the first two occurrences of |. The regular expression which I came up with to extract this portion is ^.*?(\\b[A-Z][^|]*).* The output should look like this I can't seem to figure out how to replace the elements with the extracted portions. I can think of using conditional loops in this case. But considering the size of the original dataframe, I'm not sure if that would be an efficient way to deal with this as loops tend to be slower in R. Can somebody suggest an alternative way, preferably a vectorized solution to solve this issue?
You can use gsub() inside mutate() to do the job. library(tidyverse) # my original answer df %>% mutate(Hits = gsub("^[^#].+?((?<=\\|).+?(?=\\|)).*", "\\1", Hits, perl = T)) Or # OP's regex df %>% mutate(Hits = gsub("^[^#].*?(\\b[A-Z][^\\|]*).*", "\\1", Hits, perl = T)) Both generate the same output. Output # A tibble: 15 x 4 Hits Category1 Category2 Category3 <chr> <dbl> <dbl> <dbl> 1 # test1 NA NA NA 2 # Query: tr|A4I9M8|A4I9M8_LEIIN NA NA NA 3 # 13135 NA NA NA 4 E9BQL4 0.001 100 100 5 A4I9M8 0.001 100 100 6 A0A3Q8IUE6 0.002 99 99 7 Q4Q3E9 0.003 98 98 8 A0A640KX53 0.003 98 98 9 # test2 NA NA NA 10 # Query: tr|E9AH01|E9AH01_LEIIN NA NA NA 11 # 83771 NA NA NA 12 A0A6L0XNG2 0.023 100 98 13 E9AH01 0.341 95 97 14 A0A6J8FCW4 0.341 95 97 15 A0A6J8FCW4 0.569 97 92
Determine range of time where measurements are not NA
I have a dataset with hundreds of thousands of measurements taken from several subjects. However, the measurements are only partially available, i.e., there may be large stretches with NA. I need to establish up front, for which timespan positive data are available for each subject. Data: df timestamp C B A starttime_ms 1 00:00:00.033 NA NA NA 33 2 00:00:00.064 NA NA NA 64 3 00:00:00.066 NA 0.346 NA 66 4 00:00:00.080 47.876 0.346 22.231 80 5 00:00:00.097 47.876 0.346 22.231 97 6 00:00:00.099 47.876 0.346 NA 99 7 00:00:00.114 47.876 0.346 NA 114 8 00:00:00.130 47.876 0.346 NA 130 9 00:00:00.133 NA 0.346 NA 133 10 00:00:00.147 NA 0.346 NA 147 My (humble) solution so far is (i) to pick out the range of timestamp values that are not NA and to select the first and last such timestamp for each subject individually. Here's the code for subject C: NotNA_C <- df$timestamp[which(!is.na(df$C))] range_C <- paste(NotNA_C[1], NotNA_C[length(NotNA_C)], sep = " - ") range_C [1] "00:00:00.080" "00:00:00.130" That doesn't look elegant and, what's more, it needs to be repeated for all other subjects. Is there a more efficient way to establish the range of time for which non-NA values are available for all subjects in one go? EDIT I've found a base R solution: sapply(df[,2:4], function(x) paste(df$timestamp[which(!is.na(x))][1], df$timestamp[which(!is.na(x))][length(df$timestamp[which(!is.na(x))])], sep = " - ")) C B A "00:00:00.080 - 00:00:00.130" "00:00:00.066 - 00:00:00.147" "00:00:00.080 - 00:00:00.097" but would be interested in other solutions as well! 
Reproducible data: df <- structure(list(timestamp = c("00:00:00.033", "00:00:00.064", "00:00:00.066", "00:00:00.080", "00:00:00.097", "00:00:00.099", "00:00:00.114", "00:00:00.130", "00:00:00.133", "00:00:00.147" ), C = c(NA, NA, NA, 47.876, 47.876, 47.876, 47.876, 47.876, NA, NA), B = c(NA, NA, 0.346, 0.346, 0.346, 0.346, 0.346, 0.346, 0.346, 0.346), A = c(NA, NA, NA, 22.231, 22.231, NA, NA, NA, NA, NA), starttime_ms = c(33, 64, 66, 80, 97, 99, 114, 130, 133, 147)), row.names = c(NA, 10L), class = "data.frame")
dplyr solution library(tidyverse) df <- structure(list(timestamp = c("00:00:00.033", "00:00:00.064", "00:00:00.066", "00:00:00.080", "00:00:00.097", "00:00:00.099", "00:00:00.114", "00:00:00.130", "00:00:00.133", "00:00:00.147" ), C = c(NA, NA, NA, 47.876, 47.876, 47.876, 47.876, 47.876, NA, NA), B = c(NA, NA, 0.346, 0.346, 0.346, 0.346, 0.346, 0.346, 0.346, 0.346), A = c(NA, NA, NA, 22.231, 22.231, NA, NA, NA, NA, NA), starttime_ms = c(33, 64, 66, 80, 97, 99, 114, 130, 133, 147)), row.names = c(NA, 10L), class = "data.frame") df %>% pivot_longer(-c(timestamp, starttime_ms)) %>% group_by(name) %>% drop_na() %>% summarise(min = timestamp %>% min(), max = timestamp %>% max()) #> `summarise()` ungrouping output (override with `.groups` argument) #> # A tibble: 3 x 3 #> name min max #> <chr> <chr> <chr> #> 1 A 00:00:00.080 00:00:00.097 #> 2 B 00:00:00.066 00:00:00.147 #> 3 C 00:00:00.080 00:00:00.130 Created on 2021-02-15 by the reprex package (v0.3.0)
You could look at the cumsum of differences where there's no NA, coerce them to logical and subset first and last element. lapply(data.frame(apply(rbind(0, diff(!sapply(df[c("C", "B", "A")], is.na))), 2, cumsum)), function(x) c(df$timestamp[as.logical(x)][1], rev(df$timestamp[as.logical(x)])[1])) # $C # [1] "00:00:00.080" "00:00:00.130" # # $B # [1] "00:00:00.066" "00:00:00.147" # # $A # [1] "00:00:00.080" "00:00:00.097"
Computing Growth Rates
I am working on a dataset for a welfare wage subsidy program, where wages per worker are structured as follows: df <- structure(list(wage_1990 = c(13451.67, 45000, 10301.67, NA, NA, 8726.67, 11952.5, NA, NA, 7140, NA, NA, 10301.67, 7303.33, NA, NA, 9881.67, 5483.33, 12868.33, 9321.67), wage_1991 = c(13451.67, 45000, 10301.67, NA, NA, 8750, 11952.5, NA, NA, 7140, NA, NA, 10301.67, 7303.33, NA, NA, 9881.67, 5483.33, 12868.33, 9321.67 ), wage_1992 = c(13451.67, 49500, 10301.67, NA, NA, 8750, 11952.5, NA, NA, 7140, NA, NA, 10301.67, 7303.33, NA, NA, 9881.67, NA, 12868.33, 9321.67), wage_1993 = c(NA, NA, 10301.67, NA, NA, 8750, 11958.33, NA, NA, 7140, NA, NA, 10301.67, 7303.33, NA, NA, 9881.67, NA, NA, 9321.67), wage_1994 = c(NA, NA, 10301.67, NA, NA, 8948.33, 11958.33, NA, NA, 7140, NA, NA, 10301.67, 7303.33, NA, NA, 9881.67, NA, NA, 9321.67), wage_1995 = c(NA, NA, 10301.67, NA, NA, 8948.33, 11958.33, NA, NA, 7140, NA, NA, 10301.67, 7303.33, NA, NA, 9881.67, NA, NA, 9321.67), wage_1996 = c(NA, NA, 10301.67, NA, NA, 8948.33, 11958.33, NA, NA, 7291.67, NA, NA, 10301.67, 7303.33, NA, NA, 9881.67, NA, NA, 9321.67)), class = c("tbl_df", "tbl", "data.frame" ), row.names = c(NA, -20L)) I have tried one proposed solution, which is running this code after the one above: average_growth_rate <- apply(df, 1, function(x) { x1 <- x[!is.na(x)] mean(x1[-1]/x1[-length(x1)]-1)}) out <- data.frame(rowid = seq_len(nrow(df)), average_growth_rate) out[!is.na(out$average_growth_rate),] But I keep getting this error: Error in dim(X) <- c(n, length(X)/n) : dims [product 60000] do not match the length of object [65051] I want to do the following: 1-Create a variable showing the annual growth rate of wage for each worker or lack of thereof. The practical issue that I am facing is that each observation is in one row and while the first worker joined the program in 1990, others might have joined in say 1993 or 1992. 
Therefore, is there a way to apply the growth rate for each worker depending on the specific years they worked, rather than applying a general growth formula for all observations? My expected output for each row would be having a new column average wage growth rate 1- 15% 2- 9% 3- 12% After running the following code to see descriptive statistics of my variable of interest: skim(df$average_growth_rate) I get the following result: "Variable contains Inf or -Inf value(s) that were converted to NA.── Data Summary ──────────────────────── Values Name gosi_beneficiary_growth$a... Number of rows 3671 Number of columns 1 _______________________ Column type frequency: numeric 1 ________________________ Group variables None ── Variable type: numeric ────────────────────────────────────────────────────────────────────────────── skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist 1 data 1348 0.633 Inf Inf -1 -0.450 0 0.0568 " I am not sure why my mean and standard deviation values are Inf.
Here is one approach: library(tidyverse) growth <- df %>% rowid_to_column() %>% gather(key, value, -rowid) %>% drop_na() %>% arrange(rowid, key) %>% group_by(rowid) %>% mutate(yoy = value / lag(value)-1) %>% summarise(average_growth_rate = mean(yoy, na.rm=T)) # A tibble: 12 x 2 rowid average_growth_rate <int> <dbl> 1 1 0 2 2 0.05 3 3 0 4 6 0.00422 5 7 0.0000813 6 10 0.00354 7 13 0 8 14 0 9 17 0 10 18 0 11 19 0 12 20 0 And just to highlight that all these 0s are expected, here the dataframe: > head(df) # A tibble: 6 x 7 wage_1990 wage_1991 wage_1992 wage_1993 wage_1994 wage_1995 wage_1996 <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 1 13452. 13452. 13452. NA NA NA NA 2 45000 45000 49500 NA NA NA NA 3 10302. 10302. 10302. 10302. 10302. 10302. 10302. 4 NA NA NA NA NA NA NA 5 NA NA NA NA NA NA NA 6 8727. 8750 8750 8750 8948. 8948. 8948. where you see that e.g. for the first row, there was no growth nor any decline. The second row, there was a slight increase in between the second and the third year, but it was 0 for the first and second. For the third row, again absolutely no change. Etc... Also, finally, to add these results to the initial dataframe, you would do e.g. df %>% rowid_to_column() %>% left_join(growth) And just to answer the performance question, here a benchmark (where I changed akrun's data.frame call to a tibble call to make sure there is no difference coming from this). All functions below correspond to creating the growth rates, not merging back to the original dataframe. library(microbenchmark) microbenchmark(cj(), akrun(), akrun2()) Unit: microseconds expr min lq mean median uq max neval cld cj() 5577.301 5820.501 6122.076 5988.551 6244.301 10646.9 100 c akrun() 998.301 1097.252 1559.144 1160.450 1212.552 28704.5 100 a akrun2() 2033.801 2157.101 2653.018 2258.052 2340.702 34143.0 100 b base R is the clear winner in terms of performance.
We can use base R with apply. Loop over the rows with MARGIN = 1, remove the NA elements ('x1'), get the mean of the ratio of the current and previous element average_growth_rate <- apply(df, 1, function(x) { x1 <- x[!is.na(x)] mean(x1[-1]/x1[-length(x1)]-1)}) out <- data.frame(rowid = seq_len(nrow(df)), average_growth_rate) out[!is.na(out$average_growth_rate),] # rowid average_growth_rate #1 1 0.00000000000 #2 2 0.05000000000 #3 3 0.00000000000 #6 6 0.00422328325 #7 7 0.00008129401 #10 10 0.00354038282 #13 13 0.00000000000 #14 14 0.00000000000 #17 17 0.00000000000 #18 18 0.00000000000 #19 19 0.00000000000 #20 20 0.00000000000 Or using tapply/stack na.omit(stack(tapply(as.matrix(df), row(df), FUN = function(x) mean(head(na.omit(x), -1)/tail(na.omit(x), -1) -1))))[2:1]