Multiplication in R of specific portion of a dataframe - r

I have a dataset from 1966 to 2002. I want to change the units (multiply values by 0.305) of some of the values in the dataframe, from 1967 to 1973, and want the rest of the values to remain as they are.
Sample Data
Date A01
1 1966/05/07 4.870000
2 1966/05/08 4.918333
3 1966/05/09 4.892000
4 1966/05/10 4.858917
5 1966/05/11 4.842000
6 1967/03/18 4.89517
7 1966/05/07 4.870000
8 1966/05/08 4.918333
9 1966/05/09 4.892000
10 2000/05/10 2.858917
11 2001/05/11 1.842000
12 2002/03/18 0.89517
Desired Outcome
Date A01
1 1966/05/07 1.4843
2 1966/05/08 1.4990
3 1966/05/09 1.49108
4 1966/05/10 1.480992
5 1966/05/11 1.48565
6 1967/03/18 1.4920
7 1966/05/07 1.4843
8 1966/05/08 1.4991
9 1966/05/09 1.4910
10 2000/05/10 2.858917
11 2001/05/11 1.842000
12 2002/03/18 0.89517

An option in base R would be to get the 'year' part from the Date column to create a logical index ('i1'), subset the 'A01' column, multiply by 0.305 and assign it back to the original column.
# TRUE for rows whose year (parsed from the "YYYY/MM/DD" Date string) is 1966-1973
i1 <- is.element(
  as.numeric(format(as.Date(df1$Date, "%Y/%m/%d"), "%Y")),
  1966:1973
)
# Rescale only the flagged rows; all other rows keep their value
df1$A01[i1] <- 0.305 * df1$A01[i1]
data
df1 <- structure(list(Date = c("1966/05/07", "1966/05/08", "1966/05/09",
"1966/05/10", "1966/05/11", "1967/03/18", "1966/05/07", "1966/05/08",
"1966/05/09", "2000/05/10", "2001/05/11", "2002/03/18"), A01 = c(4.87,
4.918333, 4.892, 4.858917, 4.842, 4.89517, 4.87, 4.918333, 4.892,
2.858917, 1.842, 0.89517)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))

Convert Date to date class, extract the year from it and multiply A01 with 0.305 if it is between 1967 and 1973 (inclusive) or with 1 otherwise.
library(dplyr)
library(lubridate)
df %>%
  mutate(Date = ymd(Date),
         # between() includes both bounds, so the upper bound must be 1973
         # (not 1974) to convert exactly the years 1967-1973.
         # The logical result + 1 indexes c(1, 0.305): FALSE -> 1, TRUE -> 0.305
         A01 = A01 * c(1, 0.305)[between(year(Date), 1967, 1973) + 1])

Another base R option using ifelse
# Multiply A01 by 0.305 for rows dated 1967-1973 and by 1 for every other row
transform(
  df,
  A01 = ifelse(
    as.numeric(format(as.Date(Date), "%Y")) %in% 1967:1973,
    0.305,
    1
  ) * A01
)

Related

How to calculate a overall score based on matrix positions?

I have a dataframe consisting of 12 columns with different participants, in a top 5. It looks like this:
> top_5
4 5 8 9 11 12 15 16 19 20 22 23
[1,] "Nia" "Hung" "Hanaaa" "Ramziyya" "Marissa" "Jaelyn" "Shyanne" "Jaabir" "Dionicio" "Nia" "Shyanne" "Roger"
[2,] "Razeena" "Husni" "Bradly" "Marissa" "Bradly" "Muhsin" "Razeena" "Dionicio" "Magnus" "Kelsey" "Nia" "Schyler"
[3,] "Shyanne" "Schyler" "Necko" "Johannah" "Tatiana" "Glenn" "Nia" "Jaelyn" "Shyanne" "Hanaaa" "Mildred" "German"
[4,] "Schyler" "German" "Hung" "Lubaaba" "Johannah" "Magnus" "Dionicio" "German" "German" "Razeena" "Dionicio" "Jaabir"
[5,] "Husni" "Necko" "Razeena" "Afeefa" "Schyler" "Dionicio" "Jaabir" "Roger" "Johannah" "Remy" "Jaabir" "Jaelyn"
(And can be recreated using this):
structure(c("Nia", "Razeena", "Shyanne", "Schyler", "Husni",
"Hung", "Husni", "Schyler", "German", "Necko", "Hanaaa", "Bradly",
"Necko", "Hung", "Razeena", "Ramziyya", "Marissa", "Johannah",
"Lubaaba", "Afeefa", "Marissa", "Bradly", "Tatiana", "Johannah",
"Schyler", "Jaelyn", "Muhsin", "Glenn", "Magnus", "Dionicio",
"Shyanne", "Razeena", "Nia", "Dionicio", "Jaabir", "Jaabir",
"Dionicio", "Jaelyn", "German", "Roger", "Dionicio", "Magnus",
"Shyanne", "German", "Johannah", "Nia", "Kelsey", "Hanaaa", "Razeena",
"Remy", "Shyanne", "Nia", "Mildred", "Dionicio", "Jaabir", "Roger",
"Schyler", "German", "Jaabir", "Jaelyn"), .Dim = c(5L, 12L), .Dimnames = list(
NULL, c("4", "5", "8", "9", "11", "12", "15", "16", "19",
"20", "22", "23")))
Now if a participant is in the top row, it means that they are in first place in that column (so for the 1st column "Nia" is first, "Razeena" is second, etc.). A first place in the ranking is worth 5 points, the second place 4 points, etc. Now I want to calculate for each participant in the matrix her/his points.
My goal is to make an overall top 5. How would I go about this?
Here is a "convert to long then summarise by group" method similar to M--'s answer, but with data.table
library(data.table)
# One row per rank; the first row gets the most points (.N), the last row gets 1
df <- as.data.table(top_5)
df[, points := .N:1]
# Long format: one (points, value) pair per matrix cell, then sum per name
total_points <- melt(df, id.vars = "points")[, .(points = sum(points)), by = value]
# Sort in place, highest score first, and show the overall top 5
setorder(total_points, -points)
head(total_points, n = 5)
# value points
# 1: Nia 17
# 2: Shyanne 16
# 3: Dionicio 14
# 4: Razeena 11
# 5: Schyler 10
Or an idea very similar to akrun's, just using tapply in place of sapply + split
# Row r of the ranking is worth nrow(top_5) + 1 - r points, so the weights
# follow the actual number of rows instead of assuming exactly five.
out <- sort(
  tapply(c(nrow(top_5) + 1 - row(top_5)), c(top_5), sum),
  decreasing = TRUE
)
head(out, 5)
# Nia Shyanne Dionicio Razeena Schyler
# 17 16 14 11 10
An option is to split the row index reversed with the matrix values into a list and get the sum of each list element by looping over the list (sapply)
# row(top_5)[nrow(top_5):1, ] is the matrix of row indices in reversed row order,
# so a name in the first row scores nrow(top_5) and a name in the last row scores 1;
# split() groups those scores by participant name and sum() totals each group
out <- sapply(split(row(top_5)[nrow(top_5):1, ], top_5), sum)
out
#Afeefa Bradly Dionicio German Glenn Hanaaa Hung Husni Jaabir Jaelyn Johannah Kelsey Lubaaba Magnus Marissa Mildred Muhsin
# 1 8 14 9 3 8 7 5 9 9 6 4 2 6 9 3 4
# Necko Nia Ramziyya Razeena Remy Roger Schyler Shyanne Tatiana
# 4 17 5 11 1 6 10 16 3
head(out[order(-out)], 5)
# Nia Shyanne Dionicio Razeena Schyler
# 17 16 14 11 10
Or another option is rowsum
# rowsum() totals the reversed row indices (the points) for each participant name
rowsum(c(row(top_5)[nrow(top_5):1, ]), group = c(top_5))
Using tidyverse functions:
library(tidyr)
library(dplyr)
top_5 %>%
  as.data.frame() %>%
  head(5) %>%
  # first row gets the highest rank value, last row gets 1
  mutate(rank = nrow(.):1) %>%
  # one row per (rank, column, name) combination
  pivot_longer(cols = -rank, names_to = "col", values_to = "name") %>%
  group_by(name) %>%
  summarise(points = sum(rank))
#> # A tibble: 26 x 2
#> name points
#> <fct> <int>
#> 1 Husni 5
#> 2 Nia 17
#> 3 Razeena 11
#> 4 Schyler 10
#> 5 Shyanne 16
#> 6 German 9
#> 7 Hung 7
#> 8 Necko 4
#> 9 Bradly 8
#> 10 Hanaaa 8
#> # ... with 16 more rows

Add stringA or stringB to values in column based on condition

I have a data table "dates" such as:
dates <- data.frame(date1=c("2015","1998","2000","1991"),
date2=c("98","00","18","92"))
dates <- mutate_if(dates,is.factor,as.character)
Where the values in "dates" are of class character.
I want to make "date2" a 4-digit number. For this I would like the following condition:
If "date2" starts with 9 add a 19 before the value
If "date2" starts with anything else add a 20
I have done a lot of research but I cannot find how to add a string to an already existing string by using a conditional
Afterthought: How can we deal with NA values so that it does not assign a "19" or "20" to the NAs?
A regex-free alternative:
# Two-digit years as integers
d2int <- as.integer(dates[["date2"]])
# Values above 18 are placed in the 1900s, everything else in the 2000s
dates$date2n <- as.character(ifelse(d2int > 18, 1900, 2000) + d2int)
dates
date1 date2 date2n
1 2015 98 1998
2 1998 00 2000
3 2000 18 2018
4 1991 92 1992
5 2015 89 1989
6 1998 18 2018
7 2000 19 1919
8 1991 NA <NA>
Where:
dates <- data.frame(
date1=c("2015","1998","2000","1991"),
date2=c("98","00","18","92", "89", "18", "19", "NA"),
stringsAsFactors = FALSE
)
you can use lubridate and try something like :
Input:
dates <- data.frame(date1=c("2015","1998","2000","1991", "1991", "1991"),
date2=c("98","00","18","92", "88", NA))
use:
dates %>%
  # Go through as.character() first: if date2 is a factor (the default for
  # strings in data.frame() before R 4.0), as.integer() alone would return the
  # level codes rather than the two-digit years
  mutate(date2 = as.integer(as.character(date2))) %>%
  # A year that would land in the future as 20xx is taken to be 19xx instead
  mutate(date3 = if_else(date2 + 2000 > year(today()),
                         date2 + 1900,
                         date2 + 2000))
which gives:
date1 date2 date3
1 2015 98 1998
2 1998 0 2000
3 2000 18 2018
4 1991 92 1992
5 1991 88 1988
6 1991 NA NA
p.s. added two rows to the input data to show how this handles NA values

R: Loop through data frame, extract subset of multiple variables, then store in an aggregate dataset

I have an aggregate data table with about 60 million rows. Simplified, the data looks like this:
ServiceN Customer Product LValue EDate CovBDate CovEDate
1 1 12 3 2016-08-03 2016-07-07 2017-07-06
2 1 12 19 2016-07-07 2016-07-07 2017-07-06
3 2 23 222 2017-09-09 2016-10-01 2017-09-31
4 2 23 100 2017-10-01 2017-10-01 2018-09-31
I need to go through each row and subset the entire dataset by Customer with all entry dates(EDate) between CovBDate and CovEDate. Then, I need to find the sum of the LValue for each product (we're only looking at 10, so it's not terrible).
As an example, the final dataset would look something like this:
ServiceN Customer Product LValue EDate CovBDate CovEDate Prod12 Prod23
1 1 12 3 2016-08-03 2016-07-07 2017-07-06 22 0
2 1 12 19 2016-07-07 2016-07-07 2017-07-06 22 0
3 2 23 222 2017-09-09 2016-10-01 2017-09-31 0 222
4 2 23 100 2017-10-01 2017-10-01 2018-09-31 0 100
I don't know where to begin on this problem, however, I've started with this (which does not work):
for (i in 1:length(nrow)) {
tempdata<-dataset[Customer==Customer[i] & EDate>=CovBDate[i] &
EDate<=CovEDate[i]] #data.table subsetting
tempdata$Prod12<- with(tempdata, sum(LValue[Product== "12"], na.rm=T))
#I could make this a function, but I want to get this for loop automated first...
tempdata$Prod23<- with(tempdata, sum(LValue[Product=="23"], na.rm=T))
}
My questions, therefore, are:
1) How do I make this for loop work with so many variables?
2) How do I make the new variable get added to the original dataset (called dataset)?
Using dplyr you could do something like this:
library(dplyr)
dataset <- data.frame(ServiceN = c("1", "2", "3", "4"),
Customer = c("1", "1", "2", "2"),
Product = c("12", "12", "23", "23"),
LValue = c(3, 19, 222, 100),
EDate = c("2016-08-03", "2016-07-07", "2017-09-09", "2017-10-01"),
CovBDate = c("2016-07-07", "2016-07-07", "2016-10-01", "2017-10-01"),
CovEDate = c("2017-07-06", "2017-07-06", "2017-09-31", "2018-09-31"),
stringsAsFactors = FALSE)
## Work per customer/product combination so each sum belongs to one combination
dataset %>%
  group_by(Customer, Product) %>%
  ## Keep rows whose entry date lies inside the coverage window
  ## (the date columns are character strings, so they are compared as text)
  filter(EDate >= CovBDate, EDate <= CovEDate) %>%
  ## Total LValue within each group
  summarise(Sum = sum(LValue))
## A tibble: 2 x 3
## Groups: Customer [?]
# Customer Product Sum
#<chr> <chr> <dbl>
#1 1 12 22
#2 2 23 322

Sum consecutive strings in multiple columns

I have a table with flight ids, arrivals, and departures:
> test
arrival departure flight_id
1 9 2233
2 8 1982
3 1 2164
4 9 2081
5 2130
6 2 2040
7 9 2030
8 2130
9 4 3169
10 6 2323
11 8 2130
12 2220
13 3169
14 9 2204
15 1 1910
16 2 837
17 1994
18 9 8 1994
19 1994
20 1994
21 9 1 2338
22 1 8 1981
23 9 2365
24 8 2231
25 9 2048
My objective is to count only the rows where arrival and departure are blank, and then to aggregate by flight_id. But there is a catch. I believe this cannot be done with table(), aggregate() or rle() because they do not account for breaks.
For example, only consecutive flight ids where arrival ="" and departure ="" should be counted, and the count should start again from zero if a flight id with a non-blank value occurs. NOTE: Other flight ids appearing in between don't matter - each flight id should be treated separately which is why flight 2130 is counted twice.
In other words, the resulting output from the test should look exactly like this:
output
flight_id count
1 2130 2
2 2220 1
3 3169 1
4 1994 1
5 1994 2
Notice that flight id 1994 occurs three times where arrival and departure are blank but that there is a break in between at row 18. Therefore, the flight id must be counted twice.
I have tried writing a for loop but get an error that there is missing value where TRUE/FALSE needed:
raw_data = test
unique_id = unique(raw_data$flight_id)
output<- data.frame("flight_id"= integer(0), "count" = integer(0), stringsAsFactors=FALSE)
for (flight_id in unique_id)
{
oneflight <- raw_data[ which(raw_data$flight_id == flight_id), ]
if(nrow(oneflight) >= 1 ){
for(i in 2:nrow(oneflight)) {
if(oneflight[i,"arrival"] == "" & oneflight[i,"departure"] == "") {
new_row <- c(flight_id, sum(flight_id)[i])
output[nrow(output) + 1,] = new_row
}
}
}
}
How could one improve the above code or could someone suggest a quicker method with dplyr for example? Here is a sample of the data:
> dput(test)
structure(list(arrival = c("", "", "1", "", "", "2", "9", "",
"", "6", "", "", "", "", "1", "", "", "9", "", "", "9", "1",
"9", "", "9"), departure = c("9", "8", "", "9", "", "", "", "",
"4", "", "8", "", "", "9", "", "2", "", "8", "", "", "1", "8",
"", "8", ""), flight_id = c(2233, 1982, 2164, 2081, 2130, 2040,
2030, 2130, 3169, 2323, 2130, 2220, 3169, 2204, 1910, 837, 1994,
1994, 1994, 1994, 2338, 1981, 2365, 2231, 2048)), .Names = c("arrival",
"departure", "flight_id"), row.names = c(NA, 25L), class = "data.frame")
A base R approach :
do.call(rbind, lapply(split(test, test$flight_id), function(grp) {
  # TRUE where both arrival and departure are blank
  both_blank <- grp$arrival == "" & grp$departure == ""
  # Run-length encode so every consecutive stretch of blank rows is one run
  runs <- rle(both_blank)
  n_runs <- sum(runs$values)
  # One output row per blank run, carrying the length of that run
  data.frame(
    flight_id = rep(unique(grp$flight_id), n_runs),
    count = runs$lengths[runs$values]
  )
}))
#flight_id count
# 1994 1
# 1994 2
# 2130 2
# 2220 1
# 3169 1
We split the dataframe by flight_id and for every group we apply rle to find continuous empty rows in arrival and departure and return a dataframe with the flight_id and the number of continuous empty rows in the group.
If I understand your question, one trick you could use is to add a decimal part to the flight_id, indicating a group.
For example, get an index vector
# which() returns the positions of the TRUE entries, i.e. the rows where both
# arrival and departure are blank (find() would look up objects by name instead)
i <- which(oneflight$arrival == "" & oneflight$departure == "")
Then take cumsum(1-diff(i)) / 100 (or divide by a sufficiently large power of ten), add it to the flight IDs, and you then have groups of flights that can be counted with table().
Here's a solution using data.table:
library(data.table)
# Flight ids that have at least one row with both arrival and departure blank
flights <- test$flight_id[test$arrival == "" & test$departure == ""]
# The run-length id is computed within each flight_id, so rows of other flights
# that happen to sit in between cannot split a flight's run of blank rows.
# The second [ ] keeps only the blank rows and counts them per (flight_id, grp).
setDT(test)[flight_id %in% flights,
            grp := rleid(arrival == "", departure == ""), by = flight_id][
  arrival == "" & departure == "", .(count = .N), .(flight_id, grp)]
# flight_id grp count
#1: 2130 1 2
#2: 2220 1 1
#3: 3169 2 1
#4: 1994 1 1
#5: 1994 3 2
Explanation:
First we obtain the flight_id's that have at least one record with empty arrival and departure values. Then, we use this vector flights to subset your data and generate a run-length id column based on arrival=="" and departure =="" called "grp". Lastly we generate the count of records (i.e. .N) where arrival=="" & departure =="", grouped by the columns flight_id and grp.
You can consequently drop the grp column if needed.

Organizing Multidimensional Data in R

I am trying to organize multidimensional data in R. The data is extracted in R from CSV file. My data in data frame of R is, as following:
Rank Arrangers YearAmt
1994
1 JPM 6,605.00
2 UBS 7,806.00
3 RBS 1,167.34
1995
1 Citi 1,150.00
2 Scotiabank 483.33
3 ING 800.56
4 UniCredit 700.70
This is just toy data. The original dataset is large. I would like to subset the data by year (1994, 1995, etc.) so that I can conduct some analysis. I have tried to subset the data set by factor/level using sapply and subset, but I realized R is just treating 1994 and 1995 as data in a row. I am thinking of reformatting the original csv file by creating Year as a separate column and then putting the corresponding year in that field for all the rows.
I would appreciate any help in suggesting a way to organize data in R. I am expecting an output like this:
Rank Arrangers YearAmt Year
1 JPM 6,605.00 1994
2 UBS 7,806.00 1994
3 RBS 1,167.34 1994
1 Citi 1,150.00 1995
2 Scotiabank 483.33 1995
3 ING 800.56 1995
4 UniCredit 700.70 1995
1) ave Using cumsum(Rank == "") to create a grouping variable for years, this uses ave to create a Year column creating within each group of year rows a Year consisting of NA followed by the year repeated. Finally use na.omit to remove the rows with NA. No packages are used:
# Returns NA followed by the first element of x repeated length(x) - 1 times,
# i.e. c(NA, x[1], x[1], ..., x[1])
na.year <- function(x) {
  first <- x[1]
  n_rest <- length(x) - 1
  c(NA, rep(first, n_rest))
}
# cumsum(Rank == "") goes up by one at every blank-Rank row, so each year header
# starts a new group; na.year puts NA on the first row of each group, and
# na.omit() then drops those rows
na.omit( transform(df1, Year = ave(YearAmt, cumsum(Rank == ""), FUN = na.year)) )
Using the input df1 reproducibly defined in the answer from @akrun we get:
Rank Arrangers YearAmt Year
2 1 JPM 6,605.00 1994
3 2 UBS 7,806.00 1994
4 3 RBS 1,167.34 1994
6 1 Citi 1,150.00 1995
7 2 Scotiabank 483.33 1995
8 3 ING 800.56 1995
9 4 UniCredit 700.70 1995
2) by Using by split df1 into years applying addYear to each component of the split. Finally put them back together. No packages are used.
# Drops the first row of x and appends a Year column holding that first row's
# YearAmt value
addYear <- function(x) {
  year <- x[1, "YearAmt"]
  remaining <- x[-1, ]
  cbind(remaining, Year = year)
}
# cumsum(df1$Rank == "") numbers the year groups; by() applies addYear to each
# group and rbind stacks the pieces back into one data frame
do.call("rbind", by(df1, cumsum(df1$Rank == ""), addYear))
3) sqldf Using the sqldf package we can join each row of df1 with all prior rows of itself having a zero-length Rank, taking the maximum YearAmt of those to form the Year. Then keep only those rows having a non-zero-length Rank.
library(sqldf)
# Self-join: a = blank-Rank rows located before row b; the largest YearAmt among
# them becomes b's Year, and rows b with a blank Rank are excluded by "having"
sqldf("select b.*, max(a.YearAmt) Year
from df1 a join df1 b on a.rowid < b.rowid and a.Rank = ''
group by b.rowid
having b.Rank != ''")
We create a logical vector based on the blank elements in 'Rank' ('i1'), then subset the rows of 'df1' by removing all the blank rows using 'i1' (df1[!i1,]) and transform the dataset to create the 'Year' column by replicating the 'YearAmt' (that corresponds to the blank in 'Rank') using the cumulative sum of 'i1'.
# TRUE on the rows whose Rank is blank
i1 <- df1[["Rank"]] == ''
# Keep the non-blank rows; cumsum(i1) counts the blank rows seen so far, which
# selects the matching entry of df1$YearAmt[i1] for each kept row
res <- transform(
  df1[!i1, ],
  Year = df1[["YearAmt"]][i1][cumsum(i1)[!i1]]
)
res
# Rank Arrangers YearAmt Year
#2 1 JPM 6,605.00 1994
#3 2 UBS 7,806.00 1994
#4 3 RBS 1,167.34 1994
#6 1 Citi 1,150.00 1995
#7 2 Scotiabank 483.33 1995
#8 3 ING 800.56 1995
#9 4 UniCredit 700.70 1995
Or, as @G.Grothendieck mentioned in the comments, the transform step can be made more compact:
# Build Year for every row first, then drop the blank-Rank rows in one step
res <- transform(df1, Year = YearAmt[i1][cumsum(i1)])[!i1, ]
# Reset the row names to the default 1..n sequence
row.names(res) <- NULL
NOTE: No external packages are needed. Only baseverse..
Or using dtverse/zooverse
library(data.table)
library(zoo)
# Copy YearAmt into Year on the blank-Rank rows, carry it forward with
# na.locf(), then keep only the rows with a non-blank Rank
setDT(df1)[Rank=='', Year:= YearAmt][, Year := na.locf(Year)][Rank!='']
# Rank Arrangers YearAmt Year
#1: 1 JPM 6,605.00 1994
#2: 2 UBS 7,806.00 1994
#3: 3 RBS 1,167.34 1994
#4: 1 Citi 1,150.00 1995
#5: 2 Scotiabank 483.33 1995
#6: 3 ING 800.56 1995
#7: 4 UniCredit 700.70 1995
data
df1 <- structure(list(Rank = c("", "1", "2", "3", "", "1", "2", "3",
"4"), Arrangers = c("", "JPM", "UBS", "RBS", "", "Citi", "Scotiabank",
"ING", "UniCredit"), YearAmt = c("1994", "6,605.00", "7,806.00",
"1,167.34", "1995", "1,150.00", "483.33", "800.56", "700.70")),
.Names = c("Rank",
"Arrangers", "YearAmt"), row.names = c(NA, -9L), class = "data.frame")
A tidyverse option:
library(dplyr)
library(tidyr)
# Year is taken from YearAmt on rows where Rank and Arrangers are both blank,
# and is NA on every other row
df %>%
  mutate(Year = ifelse(Rank == '' & Arrangers == '', YearAmt, NA)) %>%
  # carry each year down to the rows below it
  fill(Year) %>%
  # drop the rows with a blank Rank or a blank Arrangers
  filter(Rank != '' & Arrangers != '')
## Rank Arrangers YearAmt Year
## 1 1 JPM 6,605.00 1994
## 2 2 UBS 7,806.00 1994
## 3 3 RBS 1,167.34 1994
## 4 1 Citi 1,150.00 1995
## 5 2 Scotiabank 483.33 1995
## 6 3 ING 800.56 1995
## 7 4 UniCredit 700.70 1995

Resources