Replace data.frame values using a specified data set - R

I want to replace values in a data.frame using a specified set of replacement values:
> test_data
support count
1 0.01235235 663
2 0.01373104 737
3 0.01393598 748
4 0.01265045 679
5 0.01548236 831
6 0.01565004 840
> replace_support
2 3 4 6
-0.008884196 -0.007991622 -0.011675116 -0.013086012
The names of replace_support correspond to the row names of test_data.
My expected result replaces the matching values in the support column:
support count
1 0.01235235 663
2 -0.008884196 737
3 -0.007991622 748
4 -0.011675116 679
5 0.01548236 831
6 -0.013086012 840
Here are the example data:
test_data <- structure(list(support = c(0.0123523493684093, 0.0137310429630734,
0.0139359839028207, 0.0126504452807691, 0.0154823564481872, 0.0156500353988896
), count = c(663, 737, 748, 679, 831, 840)), .Names = c("support",
"count"), row.names = c(NA, 6L), class = "data.frame")
replace_support <- structure(c(-0.00888419577036815, -0.00799162193023339, -0.0116751160488589,
-0.0130860121134779), .Names = c("2", "3", "4", "6"))

You can use the replace function:
indexes <- as.integer(names(replace_support))
test_data$support <- replace(test_data$support, indexes, replace_support)
test_data
support count
1 0.012352349 663
2 -0.008884196 737
3 -0.007991622 748
4 -0.011675116 679
5 0.015482356 831
6 -0.013086012 840
If the names of replace_support don't match the appropriate indexes, you can supply them manually.
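For instance, a minimal sketch with the row indexes written out by hand (unname() is used here purely to illustrate the unnamed case):
## hypothetical: the same replacement values, but without names
unnamed_support <- unname(replace_support)
test_data$support <- replace(test_data$support, c(2, 3, 4, 6), unnamed_support)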

How about:
test_data$support[as.integer(names(replace_support))] <- replace_support
test_data
#> support count
#> 1 0.012352349 663
#> 2 -0.008884196 737
#> 3 -0.007991622 748
#> 4 -0.011675116 679
#> 5 0.015482356 831
#> 6 -0.013086012 840
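If the row names aren't guaranteed to be the integers 1..n, a safer sketch is to match on the actual row names instead of coercing the names to integers:
## look up positions by row name rather than assuming row i sits at position i
idx <- match(names(replace_support), rownames(test_data))
test_data$support[idx] <- replace_support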

Related

Use str_detect() to extract information from a column and then create a new column

I'm working with a data.frame that contains a column whose values are named like this: D1_open, D9_shrub, D10_open, etc.
I would like to create a new column whose values are just "open" or "shrub". That is, I would like to extract the words "open" and "shrub" from "ID_SubPlot" and put them in a new column. I believe str_detect() can be useful, but I can't figure out how.
Example data:
test <- structure(list(ID_Plant = c(243, 370, 789, 143, 559, 588, 746,
618, 910, 898), ID_SubPlot = c("D1_open", "D9_shrub", "D8_open",
"E4_shrub", "U5_shrub", "U10_open", "S10_shrub", "U10_shrub",
"S9_shrub", "S9_shrub")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
Here is one approach using separate from tidyr:
library(tidyr)
separate(test, ID_SubPlot, into = c("Code", "NewCol"), sep = "_")
Output
ID_Plant Code NewCol
1 243 D1 open
2 370 D9 shrub
3 789 D8 open
4 143 E4 shrub
5 559 U5 shrub
6 588 U10 open
7 746 S10 shrub
8 618 U10 shrub
9 910 S9 shrub
10 898 S9 shrub
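If you don't need the prefix at all, newer versions of tidyr (1.0+) let you drop it by passing NA in into; a sketch:
library(tidyr)
## NA in `into` discards that piece entirely
separate(test, ID_SubPlot, into = c(NA, "NewCol"), sep = "_")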
Regex (see also regex cheatsheet for R)
Simply use ".*_(.*)" to capture everything after the _ in the first group, then replace each string with the first captured group.
test$col = gsub(".*_(.*)", "\\1", test$ID_SubPlot)
test
ID_Plant ID_SubPlot col
1 243 D1_open open
2 370 D9_shrub shrub
3 789 D8_open open
4 143 E4_shrub shrub
5 559 U5_shrub shrub
6 588 U10_open open
7 746 S10_shrub shrub
8 618 U10_shrub shrub
9 910 S9_shrub shrub
10 898 S9_shrub shrub
Data
test=structure(list(ID_Plant = c(243, 370, 789, 143, 559, 588, 746, 618, 910, 898),
ID_SubPlot = c("D1_open", "D9_shrub", "D8_open", "E4_shrub", "U5_shrub", "U10_open", "S10_shrub", "U10_shrub", "S9_shrub", "S9_shrub")),
row.names = c(NA, -10L), class = c("data.frame"))
This could also help you. I assumed you would like to remove the ID part plus the underscore:
library(dplyr)
library(stringr)
test %>%
mutate(result = str_remove(ID_SubPlot, "^[A-Za-z]\\d+(_)"))
# A tibble: 10 x 3
ID_Plant ID_SubPlot result
<dbl> <chr> <chr>
1 243 D1_open open
2 370 D9_shrub shrub
3 789 D8_open open
4 143 E4_shrub shrub
5 559 U5_shrub shrub
6 588 U10_open open
7 746 S10_shrub shrub
8 618 U10_shrub shrub
9 910 S9_shrub shrub
10 898 S9_shrub shrub
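Since the question asks about str_detect() specifically, here is a minimal sketch with it (assuming "open" and "shrub" are the only two possible suffixes):
library(stringr)
## TRUE where the string contains "open"; everything else is labelled "shrub"
test$col <- ifelse(str_detect(test$ID_SubPlot, "open"), "open", "shrub")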

How to select data based on a list from a split data frame and then recombine in R

I am trying to do the following. I have a dataset Test:
Item_ID Test_No Category Sharpness Weight Viscocity
132 1 3 14.93199362 94.37250417 579.4236727
676 1 4 44.58750591 70.03232054 1829.170727
699 2 5 89.02760079 54.30587287 1169.226863
850 3 6 30.74535903 83.84377678 707.2280513
951 4 237 67.79568019 51.10388484 917.6609965
1031 5 56 74.06697003 63.31274502 1981.17804
1175 4 354 98.9656142 97.7523884 100.7357981
1483 5 726 9.958040999 51.29537311 1222.910211
1529 7 800 64.11430235 65.69780939 573.8266137
1698 9 125 67.83105185 96.53847341 486.9620194
1748 9 1005 49.43602318 52.9139591 1881.740184
2005 9 28 26.89821508 82.12663209 1709.556135
2111 2 76 83.03593144 85.23622731 276.5088502
I would want to split this data based on Test_No and then compute the number of unique Category values per Test_No and also the median Category value. I chose to use split and sapply in the following way, but I am getting an error about a missing parenthesis. Is there anything wrong with my approach? Please find my code below:
function(CatRange){
c(Cat_Count = length(unique(CatRange$Category)), Median_Cat = median(unique(CatRange$Category), na.rm = TRUE) )
}
CatStat <- do.call(rbind,sapply(split(Test, Test$Test_No), function(ModRange)))
Appending to my question:
I would want to display the data containing the following information:
Test_No, Category, Median_Cat and Cat_Count
We can try with dplyr
library(dplyr)
Test %>%
group_by(Test_No) %>%
summarise(Cat_Count = n_distinct(Category),
Median_Cat = median(Category,na.rm = TRUE),
Category = toString(Category))
# Test_No Cat_Count Median_Cat Category
# <int> <int> <dbl> <chr>
#1 1 2 3.5 3, 4
#2 2 2 40.5 5, 76
#3 3 1 6.0 6
#4 4 2 295.5 237, 354
#5 5 2 391.0 56, 726
#6 7 1 800.0 800
#7 9 3 125.0 125, 1005, 28
Or if you prefer base R we can also try with aggregate
aggregate(Category ~ Test_No, Test, function(x) c(Cat_Count = length(unique(x)),
Median_Cat = median(x, na.rm = TRUE), Category = toString(x)))
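Note that when the supplied function returns several values, aggregate stores them in a single matrix column; assuming the aggregate() call above is assigned to res, a sketch to flatten it into ordinary columns:
## split the matrix column into Category.Cat_Count, Category.Median_Cat, Category.Category
res <- do.call(data.frame, res)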
As far as the function written is concerned, there are some syntax issues in it: the function is never assigned to a name, and sapply is passed function(ModRange) with no body, which is what triggers the parse error. A corrected version:
new_func <- function(CatRange){
c(Cat_Count = length(unique(CatRange$Category)),
Median_Cat = median(unique(CatRange$Category), na.rm = TRUE),
Category = toString(CatRange$Category))
}
data.frame(t(sapply(split(Test, Test$Test_No), new_func)))
# Cat_Count Median_Cat Category
#1 2 3.5 3, 4
#2 2 40.5 5, 76
#3 1 6 6
#4 2 295.5 237, 354
#5 2 391 56, 726
#7 1 800 800
#9 3 125 125, 1005, 28

How many categories are there in a column in a list of data frames?

I have a list of data frames where the index indicates where one family ends and another begins. I would like to know how many categories there are in statepath column in each family.
In my example below I have two families, and I am trying to get a table with the frequency of each statepath category (233, 434, 323, etc.) in each family.
My input:
List <-
'$`1`
Chr Start End Family Statepath
1 187546286 187552094 father 233
3 108028534 108032021 father 434
1 4864403 4878685 mother 323
1 18898657 18904908 mother 322
2 460238 461771 offspring 322
3 108028534 108032021 offspring 434
$`2`
Chr Start End Family Statepath
1 71481449 71532983 father 535
2 74507242 74511395 father 233
2 181864092 181864690 mother 322
1 71481449 71532983 offspring 535
2 181864092 181864690 offspring 322
3 160057791 160113642 offspring 335'
Thus, my expected output Freq_statepath would look like:
Freq_statepath <- 'Statepath Family_1 Family_2
233 1 1
434 2 0
323 1 0
322 2 2
535 0 2
335 0 1'
I think you want something like this:
test <- list(data.frame(Statepath = c(233,434,323,322,322)),data.frame(Statepath = c(434,323,322,322)))
list_tables <- lapply(test, function(x) data.frame(table(x$Statepath)))
final_result <- Reduce(function(...) merge(..., by.x = "Var1", by.y = "Var1", all.x = T, all.y = T), list_tables)
final_result[is.na(final_result)] <- 0
> test
[[1]]
Statepath
1 233
2 434
3 323
4 322
5 322
[[2]]
Statepath
1 434
2 323
3 322
4 322
> final_result
Var1 Freq.x Freq.y
1 233 1 0
2 322 2 2
3 323 1 1
4 434 1 1
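For the original list-of-data-frames input, a minimal sketch that also produces the Family_1/Family_2 column names from the expected output (assuming List is an actual list of data frames, not the quoted text shown above):
## collect every statepath level across all families
all_levels <- unique(unlist(lapply(List, `[[`, "Statepath")))
## tabulate each family against the common set of levels (zeros included)
freq <- sapply(List, function(x) table(factor(x$Statepath, levels = all_levels)))
colnames(freq) <- paste0("Family_", seq_along(List))
Freq_statepath <- data.frame(Statepath = all_levels, freq, row.names = NULL)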

R: sum rows from column A until conditioned value in column B

I'm pretty new to R and can't seem to figure out how to deal with what seems to be a relatively simple problem. I want to sum the rows of the column 'DURATION' per 'TRIAL_INDEX', but then only those first rows where the values of 'X_POSITION" are increasing. I only want to sum the first round within a trial where X increases.
The first rows of a simplified dataframe:
TRIAL_INDEX DURATION X_POSITION
1 1 204 314.5
2 1 172 471.6
3 1 186 570.4
4 1 670 539.5
5 1 186 503.6
6 2 134 306.8
7 2 182 503.3
8 2 806 555.7
9 2 323 490.0
So, for TRIAL_INDEX 1, only the first three values of DURATION should be added (204+172+186), as this is where X has the highest value so far (going through the dataframe row by row).
The desired output should look something like:
TRIAL_INDEX DURATION X_POSITION FIRST_PASS_TIME
1 1 204 314.5 562
2 1 172 471.6 562
3 1 186 570.4 562
4 1 670 539.5 562
5 1 186 503.6 562
6 2 134 306.8 1122
7 2 182 503.3 1122
8 2 806 555.7 1122
9 2 323 490.0 1122
I tried to use dplyr to generate a new data frame that can be merged with my original data frame.
However, the code doesn't work, and I'm also not sure how to make sure it only adds the first rows per trial that have increasing values for X_POSITION.
FirstPassRT = dat %>%
group_by(TRIAL_INDEX) %>%
filter(dplyr::lag(dat$X_POSITION,1) > dat$X_POSITION) %>%
summarise(FIRST_PASS_TIME=sum(DURATION))
Any help and suggestions are greatly appreciated!
library(data.table)
dt = as.data.table(df) # or setDT to convert in place
# find the rows that will be used for summing DURATION
idx = dt[, .I[1]:.I[min(.N, which(diff(X_POSITION) < 0), na.rm = T)], by = TRIAL_INDEX]$V1
# sum the DURATION for those rows
dt[idx, time := sum(DURATION), by = TRIAL_INDEX][, time := time[1], by = TRIAL_INDEX]
dt
# TRIAL_INDEX DURATION X_POSITION time
#1: 1 204 314.5 562
#2: 1 172 471.6 562
#3: 1 186 570.4 562
#4: 1 670 539.5 562
#5: 1 186 503.6 562
#6: 2 134 306.8 1122
#7: 2 182 503.3 1122
#8: 2 806 555.7 1122
#9: 2 323 490.0 1122
Here is something you can try with the dplyr package:
library(dplyr)
dat %>% group_by(TRIAL_INDEX) %>%
mutate(IncLogic = X_POSITION > lag(X_POSITION, default = 0)) %>%
mutate(FIRST_PASS_TIME = sum(DURATION[IncLogic])) %>%
select(-IncLogic)
Source: local data frame [9 x 4]
Groups: TRIAL_INDEX [2]
TRIAL_INDEX DURATION X_POSITION FIRST_PASS_TIME
(int) (int) (dbl) (int)
1 1 204 314.5 562
2 1 172 471.6 562
3 1 186 570.4 562
4 1 670 539.5 562
5 1 186 503.6 562
6 2 134 306.8 1122
7 2 182 503.3 1122
8 2 806 555.7 1122
9 2 323 490.0 1122
If you want to summarize it down to one row per trial you can use summarize like this:
library(dplyr)
df <- data_frame(TRIAL_INDEX = c(1,1,1,1,1,2,2,2,2),
DURATION = c(204,172,186,670, 186,134,182,806, 323),
X_POSITION = c(314.5, 471.6, 570.4, 539.5, 503.6, 306.8, 503.3, 555.7, 490.0))
res <- df %>%
group_by(TRIAL_INDEX) %>%
mutate(x.increasing = ifelse(X_POSITION > lag(X_POSITION), TRUE, FALSE),
x.increasing = ifelse(is.na(x.increasing), TRUE, x.increasing)) %>%
filter(x.increasing == TRUE) %>%
summarize(FIRST_PASS_TIME = sum(DURATION))
res
#Source: local data frame [2 x 2]
#
# TRIAL_INDEX FIRST_PASS_TIME
# (dbl) (dbl)
#1 1 562
#2 2 1122
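Note that both dplyr answers keep every increasing row, not only the first increasing run; they give the right result here only because X_POSITION never rises again after its first drop. A sketch that explicitly stops at the first decrease (assuming dplyr is loaded and df is the example data frame above):
df %>%
group_by(TRIAL_INDEX) %>%
## cumsum(...) stays 0 only until the first row where X_POSITION drops
mutate(FIRST_PASS_TIME = sum(DURATION[cumsum(X_POSITION < lag(X_POSITION, default = -Inf)) == 0]))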

Inner join using an inequality expression

Background
(Not required for the question, but may be useful to read)
Rolling join on data.table with duplicate keys
Odd behaviour when joining with multiple conditions
Data
library(data.table) ## using version 1.9.6
## arrival timetable
dt_arrive <- structure(list(txn_id = c(1L, 1L, 1L, 1L, 1L), place = c("place_a",
"place_a", "place_a", "place_a", "place_a"), arrival_minutes = c(515,
534, 547, 561, 581), journey_id = 1:5), .Names = c("txn_id",
"place", "arrival_minutes", "journey_id"), class = c("data.table",
"data.frame"), row.names = c(NA, -5L), sorted = c("txn_id",
"place"))
## departure timetable
dt_depart <- structure(list(txn_id = c(1L, 1L, 1L, 1L), place = c("place_a",
"place_a", "place_a", "place_a"), arrival_minutes = c(489, 507,
519, 543), journey_id = 10:13), .Names = c("txn_id", "place",
"arrival_minutes", "journey_id"), sorted = c("txn_id", "place"
), class = c("data.table", "data.frame"), row.names = c(NA, -4L
))
> dt_arrive
txn_id place arrival_minutes journey_id
1: 1 place_a 515 1
2: 1 place_a 534 2
3: 1 place_a 547 3
4: 1 place_a 561 4
5: 1 place_a 581 5
> dt_depart
txn_id place arrival_minutes journey_id
1: 1 place_a 489 10
2: 1 place_a 507 11
3: 1 place_a 519 12
4: 1 place_a 543 13
Question
I would like to join the arrivals to the departures for only those dt_depart$journey_id that occur after dt_arrive$journey_id in terms of arrival_minutes (i.e. an inner join on txn_id & place)
For example, the output I would like is:
txn_id place journey_in_id journey_out_id journey_place_arrive journey_place_depart
1 place_a 1 12 515 519
1 place_a 1 13 515 543
1 place_a 2 13 534 543
Attempts
Using the method from the two linked questions I have constructed
setkey(dt_arrive, txn_id, place)
setkey(dt_depart, txn_id, place)
dt_join <- dt_arrive[dt_depart,
{
idx = (i.arrival_minutes > arrival_minutes)
.(journey_in_id = journey_id[idx],
journey_out_id = i.journey_id,
journey_place_arrive = arrival_minutes[idx],
journey_place_depart = i.arrival_minutes
)
},
by=.EACHI]
But this gives everything from dt_depart, so includes NAs in the result - which suggests a 'right join':
txn_id place journey_in_id journey_out_id journey_place_arrive journey_place_depart
1: 1 place_a NA 10 NA 489
2: 1 place_a NA 11 NA 507
3: 1 place_a 1 12 515 519
4: 1 place_a 1 13 515 543
5: 1 place_a 2 13 534 543
I've tried using nomatch=0 to force it to 'inner join', but this hasn't worked.
I can use complete.cases to remove the NA rows, but I was wondering if there's a way of doing this within the query itself?
Here's the unclever approach: take the cross/Cartesian join, and then filter.
merge(dt_arrive, dt_depart, allow.cartesian=TRUE)[arrival_minutes.y > arrival_minutes.x]
# txn_id place arrival_minutes.x journey_id.x arrival_minutes.y journey_id.y
# 1: 1 place_a 515 1 519 12
# 2: 1 place_a 515 1 543 13
# 3: 1 place_a 534 2 543 13
By taking the Cartesian join, we're liable to eat up a lot of memory.
A potential solution is to use foverlaps by making some arbitrary interval columns
setDT(dt_arrive)
setDT(dt_depart)
dt_arrive[, `:=`(arrival_minutes_copy = arrival_minutes)]
## reorder columns
dt_arrive <- dt_arrive[, .(txn_id, place, journey_id, arrival_minutes, arrival_minutes_copy)]
dt_depart[, `:=`(arrival_minutes_copy = min(arrival_minutes))]
## reorder columns
dt_depart <- dt_depart[, .(txn_id, place, journey_id, arrival_minutes_copy, arrival_minutes)]
setkey(dt_arrive, arrival_minutes, arrival_minutes_copy)
setkey(dt_depart, arrival_minutes_copy, arrival_minutes)
foverlaps(dt_arrive,
dt_depart,
type = "within",
nomatch=0L)
# place txn_id journey_id arrival_minutes_copy arrival_minutes i.txn_id i.journey_id i.arrival_minutes i.arrival_minutes_copy
# 1: place_a 1 12 489 519 1 1 515 515
# 2: place_a 1 13 489 543 1 1 515 515
# 3: place_a 1 13 489 543 1 2 534 534
Benchmarking
library(microbenchmark)
fun_foverlap <- function(dt_a, dt_d){
dt <- foverlaps(dt_a,
dt_d,
type = "within",
nomatch=0L)
return(dt)
}
fun_merge <- function(dt_a, dt_d){
dt <- merge(dt_a, dt_d, allow.cartesian=TRUE)[arrival_minutes.y > arrival_minutes.x]
return(dt)
}
fun_nomatch <- function(dt_a, dt_d){
dt <- dt_a[dt_d, nomatch=0, allow.cartesian=TRUE][i.arrival_minutes > arrival_minutes]
return(dt)
}
microbenchmark(fun_foverlap(dt_arrive_foverlap, dt_depart_foverlap),
fun_merge(dt_arrive_merge, dt_depart_merge),
fun_nomatch(dt_arrive_nomatch, dt_depart_nomatch))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# fun_foverlap(dt_arrive_foverlap, dt_depart_foverlap) 3538.189 3717.077 3967.6648 3872.586 4006.7205 5812.355 100 c
# fun_merge(dt_arrive_merge, dt_depart_merge) 883.697 925.655 980.4159 942.877 967.9745 2223.147 100 b
# fun_nomatch(dt_arrive_nomatch, dt_depart_nomatch) 593.082 625.471 682.8975 643.034 665.4125 2077.748 100 a
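For completeness: data.table 1.9.8 (released after this question, which uses 1.9.6) added non-equi joins, so the inequality can live directly in on= and nomatch=0L gives the inner join; a sketch:
## x (dt_arrive) rows must arrive strictly before the matching i (dt_depart) row departs
dt_arrive[dt_depart,
on = .(txn_id, place, arrival_minutes < arrival_minutes),
nomatch = 0L,
.(txn_id, place,
journey_in_id = x.journey_id,
journey_out_id = i.journey_id,
journey_place_arrive = x.arrival_minutes,
journey_place_depart = i.arrival_minutes)]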
