merging outputs from a loop - r

I have two datasets, named E and eF respectively.
E<- structure(list(Inception_Date = structure(c(962323200, 962323200,
810950400, 988675200, 1042502400, 1536624000), tzone = "UTC", class =
c("POSIXct","POSIXt")), Name = c("Calvert Social Index B", "Calvert US
Large Cap Core Rspnb Idx A", "Green Century Equity Individual
Investor", "Praxis Value Index A", "Vanguard FTSE Social Index I",
"Amundi IS Amundi MSCI USA SRI ETF DR")), row.names = c(NA, -6L),
class = c("tbl_df", "tbl", "data.frame"))
eF <- structure(list(Inception_Date = structure(c(760233600, 519868800,
1380067200, 1101772800, 1325203200, 628473600, 1325203200, 1123804800
), tzone = "UTC", class = c("POSIXct", "POSIXt")), Name = c("Amana
Growth Investor", "Amana Income Investor", "Amana Income
Institutional", "American Century Sustainable Equity A",
"Ariel Appreciation Institutional", "Ariel Appreciation Investor",
"Ariel Focus Institutional", "Baywood Socially Responsible Invs"
)), row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"
))
I applied the following code to E and eF.
# For each row k of E, keep the rows of eF whose Inception_Date differs from
# E$Inception_Date[k] by at most 1500, and print that subset.
for (k in 1:nrow(E)) {
F_temp <- eF;
# G_temp is reassigned on every pass, so after the loop it holds only the
# subset computed for the last row of E.
# NOTE(review): subtracting POSIXct values yields a difftime whose units R
# picks automatically, so the unit behind 1500 is not fixed here — confirm.
G_temp <- F_temp %>% filter(abs(F_temp$Inception_Date-
E$Inception_Date[k]) <= 1500);
print(G_temp)}
After the loop, "G_temp" in the "Global Environment" shows 0 obs. of 2 variables, which is only the result of the last iteration of the loop. How can I write a .csv file that contains the "G_temp" results of all iterations merged together, with duplicates removed?
Thanks

Using your exact filter criteria would this do it?
library(dplyr)

# Collect, for every row of E, the rows of eF whose Inception_Date lies within
# `max_gap_days` days of that row's Inception_Date, then stack the pieces,
# drop duplicate rows and write the result to "G_temp.csv".
max_gap_days <- 1500

# Subtracting POSIXct values with `-` lets R choose the difftime units
# (secs/mins/hours/days) from the smallest gap, so the meaning of 1500 could
# change between iterations. Requesting days explicitly keeps it fixed; for the
# example data every gap is at least one day, so the matches are unchanged.
matches <- lapply(seq_len(nrow(E)), function(k) {
  gap_days <- as.numeric(
    difftime(eF$Inception_Date, E$Inception_Date[k], units = "days")
  )
  # filter() drops rows where the condition is NA, as in the original loop.
  filter(eF, abs(gap_days) <= max_gap_days)
})

# Starting from a zero-row copy of eF keeps the column types (and still yields
# both columns) when E is empty or nothing matches.
G_temp <- bind_rows(c(list(eF[0, ]), matches)) %>%
  distinct(Inception_Date, Name)

write.csv(G_temp, "G_temp.csv")

Related

Populate a new column in one table based on start and end dates in another table

I have a larger data table (called raw.data) and a smaller one (called balldrop.times) listing the start and end times of an event.
I want to create a new column in the larger data table that will fill up the times between the event start and end date that are located in the smaller table. The times that aren't between the event start/end time can be labeled something else, it doesn't really matter.
#the dput of the smaller table
> dput(balldrop.times)
structure(list(Stage = 6:14,
BallStart = structure(c(1635837081, 1635847841, 1635856675, 1635866152, 1635878326, 1635886132, 1635895547, 1635902934, 1635911136), tzone = "", class = c("POSIXct", "POSIXt")),
BallEnd = structure(c(1635837364, 1635848243, 1635857005, 1635866475, 1635878704, 1635886465, 1635895905, 1635903786, 1635911457), tzone = "", class = c("POSIXct", "POSIXt"))),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L))
#here is part of the larger table just in case
> dput(head(raw.data, 5))
structure(list(DateTime = structure(c(1635825603.6576, 1635825604.608, 1635825605.6448, 1635825606.6816, 1635825607.632), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
Press.Well = c(1154.2561461, 1154.0308849, 1149.7247783, 1152.0544566, 1155.7363779),
row.names = c(NA, -5L),
class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x0000020725b51ef0>)
My desired output is something like the following, with "Event Active" only for the times between the listed DateTime values in the balldrop.times table:
DateTime
Press.Well
Event Status
2021-11-02 02:11:20
10
Event Not Active
2021-11-02 02:11:21
10
Event Active
2021-11-02 02:11:22
15
Event Active
...
...
...
2021-11-02 02:16:04
25
Event Active
2021-11-02 02:16:05
30
Event Not Active
I am thinking I can use mutate() to create a new column in the raw.data table and set conditions for the DateTime, but I am not sure how to do this for multiple separate start/end DateTimes.
Any help would be appreciated. Thank you.
Your code isn't working. Neither do the times in your example table correspond with the ones in your expected output.
tmp <- structure(list(Stage = 6:14,
BallStart = structure(c(1635837081, 1635847841, 1635856675, 1635866152, 1635878326, 1635886132, 1635895547, 1635902934, 1635911136), tzone = "", class = c("POSIXct", "POSIXt")),
BallEnd = structure(c(1635837364, 1635848243, 1635857005, 1635866475, 1635878704, 1635886465, 1635895905, 1635903786, 1635911457), tzone = "", class = c("POSIXct", "POSIXt"))
),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L))
tmp1 <- structure(list(DateTime = structure(c(1635825603.6576, 1635825604.608, 1635825605.6448, 1635825606.6816, 1635825607.632), class = c("POSIXct", "POSIXt"), tzone = "GMT"),
Press.Well = c(1154.2561461, 1154.0308849, 1149.7247783, 1152.0544566, 1155.7363779) ), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L))
So note this isn't a clean solution.
# TRUE for each element of `times` that falls inside at least one
# [starts[i], ends[i]] interval (bounds inclusive); comparisons that yield NA
# count as "not inside".
in_any_interval <- function(times, starts, ends) {
  vapply(
    seq_along(times),
    function(i) any(times[i] >= starts & times[i] <= ends, na.rm = TRUE),
    logical(1)
  )
}

# Label every row of tmp1 by checking DateTime against all rows of tmp, rather
# than only the first five.
tmp1 %>%
  mutate(`Event Status` = case_when(
    in_any_interval(DateTime, tmp$BallStart, tmp$BallEnd) ~ "Event Active",
    TRUE ~ "Event Not Active"
  ))
Because you want to check several conditions, case_when is preferable to ifelse. With it, DateTime is compared against the rows of your reference table.
As noted, this isn't a clean solution: writing one condition per reference row means the code grows linearly with the size of the reference table. You can clean it up by moving the interval check into a function.

Identify and strip characters from columns

I have a large dataset in which I want to identify and remove characters and signs to keep only the numeric value.
For example I want -£1125.91m to be -1125.91
dataset
Event var1 var2
<fct> <chr> <chr>
1 Labour Costs YoY 13.34m 0.026
2 Unemployment Change (000's) $16.91b -0.449
3 Unemployment Rate -£1125.91m 0.89k
4 Jobseekers Net Change ¥1012.74b 9.56m
At the moment I know how to remove a single character from the column. Like this:
# Remove every "k" from var1; the column read on the right-hand side must be
# the same var1 that is being assigned (there is no column named `var`).
dataset$var1 <- gsub("k", "", dataset$var1)
Doing this manually will be a lot of work because the dataset is really big. I was wondering if you can identify and remove all the characters, so also the currency symbols and the m's and b's all at once?
To replicate the dataset:
dataset <- structure(list(Event = structure(2:5, .Label = c("Event", "Labour Costs YoY",
"Unemployment Change (000's)", "Unemployment Rate", "Jobseekers Net Change"),
.Names = c("", "", "", ""), class = "factor"), var1 = c("13.34m", "$16.91b", "-£1125.91m", "¥1012.74b"), var2 = c(0.026, -0.449, "0.89k", "9.56m")), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
To remove all but a hyphen, digit or a dot, you can use
dataset$var1 <- gsub("[^-0-9.]", "", dataset$var1)
The [^-0-9.] pattern is a negated character class that matches any char but the ones defined in the class.
See the regex demo online.
See an online R demo:
# Rebuild the example data: Event is a factor, var1 and var2 are character
# columns mixing numbers with currency symbols and unit suffixes.
dataset <- structure(list(Event = structure(2:5, .Label = c("Event", "Labour Costs YoY",
"Unemployment Change (000's)", "Unemployment Rate", "Jobseekers Net Change"),
.Names = c("", "", "", ""), class = "factor"), var1 = c("13.34m", "$16.91b", "-£1125.91m", "¥1012.74b"), var2 = c(0.026, -0.449, "0.89k", "9.56m")), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# Keep only hyphens, digits and dots. The comma is deliberately not in the
# class, so thousands separators are stripped along with the other symbols.
gsub("[^-0-9.]", "", dataset$var1)
## => [1] "13.34" "16.91" "-1125.91" "1012.74"

Find value of a row by comparing two columns and a value with a range of a different dataset

I have 2 different datasets. One with an object that comes from a StationX and goes to StationY and arrives at a specific date and time as the following.
df1<-structure(list(From = c("Station1", "Station5", "Station6", "Station10"), To = c("Station15", "Station2", "Station2", "Station7"),
Arrival = structure(c(971169720, 971172720, 971178120, 971179620), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L),class = c("tbl_df","tbl", "data.frame"))
Dataset2 contains, e.g., trucks that wait for the specific object at StationY between the date-times "Arrival" and "Departure", and leave at "Departure" for a specific region "TOID".
As in the following:
df2<-structure(list(TOID = c(2, 4, 7, 20), Station = c("Station15",
"Station2", "Station2","Station7"), Arrival = structure(c(971169600, 971172000, 971177700, 971179500), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Departure1 = structure(c(971170200, 971173200, 971178600, 971179800), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I want to look for the TOID in Dataset2 and add it to Dataset1 if "TO"(Dataset1)="Station"(Dataset2) and "Arrival"(Dataset2)<="Arrival"(Dataset1)<="Departure"(Dataset2) and has therefore the following outcome:
df1outcome<-structure(list(From = c("Station1", "Station5", "Station6", "Station10"
), To = c("Station15", "Station2", "Station2", "Station7"), `TO_ID` = c(2, 4, 7, 20), Arrival = structure(c(971169720, 971172720, 971178120, 971179620), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
I need a solution which looks in dataset2 for the ID that matches the conditions regardless the roworder.
Would be awesome if you guys could help me how to code this in R.
Best,
J
Perhaps you could use tidyverse, use a left_join based on the station, and then filter based on dates:
library(tidyverse)
# Pair each object in df1 with the df2 rows for its destination station; the
# clashing Arrival columns are suffixed to Arrival1 (df1) and Arrival2 (df2).
# Then keep the pairings where Arrival1 lies between Arrival2 and Departure1,
# and drop the two df2 time columns.
df1 %>%
  left_join(df2, by = c("To" = "Station"), suffix = c("1", "2")) %>%
  filter(Arrival2 <= Arrival1, Arrival1 <= Departure1) %>%
  select(-Arrival2, -Departure1)
# A tibble: 4 x 4
From To Arrival1 TOID
<chr> <chr> <dttm> <dbl>
1 Station1 Station15 2000-10-10 09:22:00 2
2 Station5 Station2 2000-10-10 10:12:00 4
3 Station6 Station2 2000-10-10 11:42:00 7
4 Station10 Station7 2000-10-10 12:07:00 20
I'm pretty new to R, so this code is probably longer than it should be. But does this work?
# Give the two Arrival columns distinct names so they can be told apart after
# the tables are combined.
df1 <- rename(df1, Arrival_Package = Arrival)
df2 <- rename(df2, Arrival_Truck = Arrival)

# Match df1$To against df2$Station, keep the rows where the package arrival
# falls between the truck's arrival and departure, and retain only the
# columns of interest.
df1outcome <- merge(df1, df2, by.x = "To", by.y = "Station") %>%
  subset(Arrival_Truck <= Arrival_Package & Arrival_Package <= Departure1) %>%
  select(From, To, TOID, Arrival_Package)

R: drop columns from tibbles inside a function

This is a followthrough of this topic. Here are my 3 tibbles:
dftest_tw <- structure(list(text = c("RT #BitMEXdotcom: A new high: US$500M turnover in the last 24 hours, over 80% of it on $XBTUSD. Congrats to the team and thank you to our u…",
"RT #Crowd_indicator: Thank you for this nice video, #Nicholas_Merten",
"RT #Crowd_indicator: Review of #Cindicator by DataDash: t.co/D0da3u5y3V"
), Tweet.id = c("896858423521837057", "896858275689398272", "896858135314538497"
), created.date = structure(c(17391, 17391, 17391), class = "Date"),
created.week = c(33, 33, 33)), .Names = c("text", "Tweet.id",
"created.date", "created.week"), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
dftest1_tw <- dftest_tw
dftest2_tw <- dftest_tw
myUserList <- ls(,pattern = "_tw")
Following yesterday's topic, I get the wanted result when running this:
library(tidyverse)
# Add a Twitter.name column holding each tibble's own object name: mget()
# fetches the objects named in myUserList and map2() pairs each tibble (.x)
# with its name (.y).
lst <- mget(myUserList) %>%
  map2(myUserList, ~mutate(.data = .x, Twitter.name = .y))
# Copy the modified tibbles back over the originals as a separate step. Piping
# straight into list2env(lst, ...) passes `lst` as an argument before it has
# been assigned and leaves `lst` bound to the returned environment, not the list.
invisible(list2env(lst, envir = .GlobalEnv))
I need to drop a few columns from each df. This does the job when run on one df:
# Drop three columns from one tibble; select() with negated column names
# replaces the deprecated standard-evaluation verb select_().
select(dftest_tw, -text, -Tweet.id, -created.date)
It seems I have a serious problem when it comes to applying code to each member of a list. I can't find a way to apply it to all the dfs, whether using lapply or writing a function:
# Drop the text, Tweet.id and created.date columns from a data frame.
#
# x: a data frame / tibble containing those three columns.
# Returns: `x` without the three columns.
MySelect <- function(x) {
  # The select() result must be the function's value; evaluating it and then
  # returning `x` would hand back the input unchanged.
  select(x, -text, -Tweet.id, -created.date)
}
# Store each trimmed tibble back under its own name; calling MySelect() without
# assigning its result leaves the original objects untouched.
for (var in myUserList) {
  assign(var, MySelect(get(var)))
}
Thank you for your help.

R: add a new column to dataframes from a function

I have many tibbles similar to this:
dftest_tw <- structure(list(text = c("RT #BitMEXdotcom: A new high: US$500M turnover in the last 24 hours, over 80% of it on $XBTUSD. Congrats to the team and thank you to our u…",
"RT #Crowd_indicator: Thank you for this nice video, #Nicholas_Merten",
"RT #Crowd_indicator: Review of #Cindicator by DataDash: t.co/D0da3u5y3V"
), Tweet.id = c("896858423521837057", "896858275689398272", "896858135314538497"
), created.date = structure(c(17391, 17391, 17391), class = "Date"),
created.week = c(33, 33, 33)), .Names = c("text", "Tweet.id",
"created.date", "created.week"), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
For testing, we add another one:
dftest2_tw <- dftest_tw
I have this list of my df:
myUserList <- ls(,pattern = "_tw")
What I am looking to do is:
1- add a new column named Twitter.name
2- fill the column with the df name, all this in a function. The following code works for each df taken one by one:
dftest_tw %>% rowwise() %>% mutate(Twitter.name = myUserList[1])
The desired result is this:
MyRes <- structure(list(text = c("RT #BitMEXdotcom: A new high: US$500M turnover in the last 24 hours, over 80% of it on $XBTUSD. Congrats to the team and thank you to our u…",
"RT #Crowd_indicator: Thank you for this nice video, #Nicholas_Merten",
"RT #Crowd_indicator: Review of #Cindicator by DataDash: t.co/D0da3u5y3V"
), Tweet.id = c("896858423521837057", "896858275689398272", "896858135314538497"
), created.date = structure(c(17391, 17391, 17391), class = "Date"),
created.week = c(33, 33, 33), retweet = c(0, 0, 0), custom = c(0,
0, 0), Twitter.name = c("dftest_tw", "dftest_tw", "dftest_tw"
)), .Names = c("text", "Tweet.id", "created.date", "created.week",
"retweet", "custom", "Twitter.name"), class = c("rowwise_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L))
When it comes to writing a function that can then be applied to all my dfs (more than 100), I can't get it to work. Any help would be appreciated.
We can use tidyverse options. Get the values of multiple string objects with mget, then with map2 from purrr, create the new column 'Twitter.name' in each dataset of the list with the corresponding string element of 'myUserList'.
library(tidyverse)
# Fetch the tibbles named in myUserList and add to each one a Twitter.name
# column filled with the matching element of myUserList.
lst <- mget(myUserList) %>%
  map2(
    myUserList,
    function(df, df_name) mutate(df, Twitter.name = df_name)
  )
If we need to modify the objects in the global environment, use list2env
list2env(lst, envir = .GlobalEnv)

Resources