expand data.frame and insert average in another column - r

If the data set is
date CPI
2000/ 1 1.2
2000/ 2 3.2
2000/ 3 1.6
then I want to get a weekly cpi
So this is my expected result.
date CPI Average
2000/ 1 1.2 0.3
2000/ 1 1.2 0.3
2000/ 1 1.2 0.3
2000/ 1 1.2 0.3
2000/ 2 3.2 0.8
2000/ 2 3.2 0.8
2000/ 2 3.2 0.8
2000/ 2 3.2 0.8
2000/ 3 1.6 0.4
2000/ 3 1.6 0.4
2000/ 3 1.6 0.4
2000/ 3 1.6 0.4
How Can I do this in R program?
please help me. my monthly Cpi is almost 200.

May be this helps:
n <- 4
mydf1 <- transform(mydf, Average=CPI/n) #created a new column `Average` by dividing CPI by n
mydf2 <-mydf1[rep(1:nrow(mydf1),each=n),] #replicate the row numbers of the dataset `mydf1` by `n` and used the numeric index to expand the rows of `mydf1`
row.names(mydf2) <- 1:nrow(mydf2) #change the rownames
mydf2
# date CPI Average
#1 2000/ 1 1.2 0.3
#2 2000/ 1 1.2 0.3
#3 2000/ 1 1.2 0.3
#4 2000/ 1 1.2 0.3
#5 2000/ 2 3.2 0.8
#6 2000/ 2 3.2 0.8
#7 2000/ 2 3.2 0.8
#8 2000/ 2 3.2 0.8
#9 2000/ 3 1.6 0.4
#10 2000/ 3 1.6 0.4
#11 2000/ 3 1.6 0.4
#12 2000/ 3 1.6 0.4
Or using data.table
Here, the idea is similar to the above. First convert the data.frame to data.table using setDT. Create a new column Average:=CPI/n. Then use replicate rep the rownumbers of the dataset with n and use that numeric index to expand the rows of mydf
library(data.table)
setDT(mydf)[mydf[, Average:=CPI/n][,rep(seq_len(.N), each=n)]]
# date CPI Average
# 1: 2000/ 1 1.2 0.3
# 2: 2000/ 1 1.2 0.3
# 3: 2000/ 1 1.2 0.3
# 4: 2000/ 1 1.2 0.3
# 5: 2000/ 2 3.2 0.8
# 6: 2000/ 2 3.2 0.8
# 7: 2000/ 2 3.2 0.8
# 8: 2000/ 2 3.2 0.8
# 9: 2000/ 3 1.6 0.4
#10: 2000/ 3 1.6 0.4
#11: 2000/ 3 1.6 0.4
#12: 2000/ 3 1.6 0.4
If you need to separate the date in to year and quarter as shown in #KFB's post, you could use cSplit along with data.table. In the below code, setnames are used to rename the columns after the split. Rest of the procedure is the same as above.
Link to cSplit is https://gist.github.com/mrdwab/11380733
library(devtools)
source_gist(11380733)
DT1 <- setnames(cSplit(mydf, "date", '[/]', fixed=FALSE,direction='wide'),
c("CPI", "year", "Quarter"))
DT1[DT1[, Average:= CPI/n][,rep(seq_len(.N), each=n)]]
# CPI year Quarter Average
#1: 1.2 2000 1 0.3
#2: 1.2 2000 1 0.3
#3: 1.2 2000 1 0.3
#4: 1.2 2000 1 0.3
#5: 3.2 2000 2 0.8
#6: 3.2 2000 2 0.8
#7: 3.2 2000 2 0.8
#8: 3.2 2000 2 0.8
#9: 1.6 2000 3 0.4
#10: 1.6 2000 3 0.4
#11: 1.6 2000 3 0.4
#12: 1.6 2000 3 0.4
data
mydf <- structure(list(date = c("2000/ 1", "2000/ 2", "2000/ 3"), CPI = c(1.2,
3.2, 1.6)), .Names = c("date", "CPI"), class = "data.frame", row.names = c("1",
"2", "3"))

Another data.table solution using #akrun's mydf:
mydt = data.table(mydf)
mydt2 = mydt[,data.table(apply(.SD,2,function(x) rep(x,4))),]
mydt2$CPI = as.numeric(mydt2$CPI)
mydt2[,Average:=CPI/4,]
mydt2
date CPI Average
1: 2000/ 1 1.2 0.3
2: 2000/ 2 3.2 0.8
3: 2000/ 3 1.6 0.4
4: 2000/ 1 1.2 0.3
5: 2000/ 2 3.2 0.8
6: 2000/ 3 1.6 0.4
7: 2000/ 1 1.2 0.3
8: 2000/ 2 3.2 0.8
9: 2000/ 3 1.6 0.4
10: 2000/ 1 1.2 0.3
11: 2000/ 2 3.2 0.8
12: 2000/ 3 1.6 0.4

Related

How to count number of columns by condition on another column

I have a dataframe that looks like this:
data <- as.data.frame(cbind('01-01-2018' = c(1.2,3.1,0.7,-0.3,2.0), '02-01-2018' = c(-0.1, 2.4, 4.9,-3.3,-2.7), '03-01-2018' = c(3.4, -2.6, -1.8, 0.1, 0.3)))
01-01-2018 02-01-2018 03-01-2018
1 1.2 -0.1 3.4
2 3.1 2.4 -2.6
3 0.7 4.9 -1.8
4 -0.3 -3.3 0.1
5 2.0 -2.7 0.3
I want to count how many times per each row, a value is bigger than the average of the corresponding row.
data$mn <- apply(data, 1, mean)
01-01-2018 02-01-2018 03-01-2018 mn
1 1.2 -0.1 3.4 1.5000000
2 3.1 2.4 -2.6 0.9666667
3 0.7 4.9 -1.8 1.2666667
4 -0.3 -3.3 0.1 -1.1666667
5 2.0 -2.7 0.3 -0.1333333
My last attempt was the following:
df$events <- apply(data, 1, function(x) sum(x > data$mn))
uhi_events <- numeric(nrow(data))
for (i in 1:nrow(data)) {
uhi <- data[[6]][[i]][["values"]]
uhi_events[i] <- sum(uhi)
}
data$uhi_events <- uhi_events
Is there a more efficient option?
EDIT:
What if the condition is on another column, let's say data$c1, that is not obtained through a simple formula?
data$md <- apply(data, 1, median)
01-01-2018 02-01-2018 03-01-2018 md
1 1.2 -0.1 3.4 1.5000000
2 3.1 2.4 -2.6 0.9666667
3 0.7 4.9 -1.8 1.2666667
4 -0.3 -3.3 0.1 -1.1666667
5 2.0 -2.7 0.3 -0.1333333
Using rowMeans and rowSums:
data$cnt <- rowSums(data > rowMeans(data))
data
# 01-01-2018 02-01-2018 03-01-2018 cnt
# 1 1.2 -0.1 3.4 1
# 2 3.1 2.4 -2.6 2
# 3 0.7 4.9 -1.8 1
# 4 -0.3 -3.3 0.1 2
# 5 2.0 -2.7 0.3 2
If the column was already computed replace rowMeans with existing column data$c1:
#get index excluding "c1":
ix <- grep("c1", colnames(data), invert = TRUE)
data$cnt <- rowSums(data[, ix ] > data$c1)
Using a user defined function to sum from a logical operation (logical vector is coerced by sum() to an integer vector such that TRUE = 1 and FALSE = 0)
data$uhi_events <-
apply(data, 1, function(i){
sum(i>mean(i))
})
library(data.table)
setDT(data)
data[, above_mean := rowSums(.SD > rowMeans(.SD))]
# 01-01-2018 02-01-2018 03-01-2018 above_mean
# 1: 1.2 -0.1 3.4 1
# 2: 3.1 2.4 -2.6 2
# 3: 0.7 4.9 -1.8 1
# 4: -0.3 -3.3 0.1 2
# 5: 2.0 -2.7 0.3 2
edit for question in comments
compare to value in first column
data[, above_col1 := rowSums(.SD > `01-01-2018`)]
# 01-01-2018 02-01-2018 03-01-2018 above_col1
# 1: 1.2 -0.1 3.4 1
# 2: 3.1 2.4 -2.6 0
# 3: 0.7 4.9 -1.8 1
# 4: -0.3 -3.3 0.1 1
# 5: 2.0 -2.7 0.3 0
Using a dplyr approach:
library(dplyr)
data <- as.data.frame(cbind('01-01-2018' = c(1.2,3.1,0.7,-0.3,2.0), '02-01-2018' = c(-0.1, 2.4, 4.9,-3.3,-2.7), '03-01-2018' = c(3.4, -2.6, -1.8, 0.1, 0.3)))
data$mm <- apply(data,1,median)
data %>%
rowwise %>%
mutate(count = sum(c_across(1:3) > mm))
#> # A tibble: 5 × 5
#> # Rowwise:
#> `01-01-2018` `02-01-2018` `03-01-2018` mm count
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 1.2 -0.1 3.4 1.2 1
#> 2 3.1 2.4 -2.6 2.4 1
#> 3 0.7 4.9 -1.8 0.7 1
#> 4 -0.3 -3.3 0.1 -0.3 1
#> 5 2 -2.7 0.3 0.3 1

Adding one day at each value change

I have a data.table with millions of rows in the following format.
There are multi-year results for each ID, however I only know the day of the year going from 1 to 365 or 366. I don't know the month nor the year, but I know the date for the first row (e.g. 1995/1/1).
ID DAY ATRR1 ATRR2
1 1 0.2 0.4
2 1 1.2 0.5
3 1 0.8 1.4
1 2 1.3 1.5
2 2 2.3 0.3
3 2 1.7 1.3
1 3 1.5 1.4
2 3 2.1 1.3
3 3 1.2 0.3
...
1 365 1.5 1.4
2 365 2.1 1.3
3 365 1.2 0.3
1 1 1.5 1.4
2 1 2.1 1.3
3 1 1.2 0.3
1 2 1.3 1.5
2 2 2.3 0.3
3 2 1.7 1.3
...
I would like to add a DATE column adding one day at each change in the DAY column, so the result would be:
ID DAY ATRR1 ATRR2 DATE
1 1 0.2 0.4 1995/1/1
2 1 1.2 0.5 1995/1/1
3 1 0.8 1.4 1995/1/1
1 2 1.3 1.5 1995/1/2
2 2 2.3 0.3 1995/1/2
3 2 1.7 1.3 1995/1/2
1 3 1.5 1.4 1995/1/3
2 3 2.1 1.3 1995/1/3
3 3 1.2 0.3 1995/1/3
...
1 365 1.5 1.4 1995/12/31
2 365 2.1 1.3 1995/12/31
3 365 1.2 0.3 1995/12/31
1 1 1.5 1.4 1996/1/1
2 1 2.1 1.3 1996/1/1
3 1 1.2 0.3 1996/1/1
1 2 1.3 1.5 1996/1/2
2 2 2.3 0.3 1996/1/2
3 2 1.7 1.3 1996/1/2
...
How would it be possible to do that?
You can simply do this:
as.Date(x, origin="1994-12-31")
My assumption here is that you don't have gaps in your dates and arranged as described in the question, otherwise this shall produce undesirable results.
Sample data:
df <- data.frame(Day = rep(c(1:365,1:2),each=3))
Create a seq like this using rle(run length encoding)
df$seq <- data.table::rleid(df$Day)
df$date <- as.Date(df$seq, origin="1994-12-31") #final answer
tail(df,8)
Let me know , if this is your expectation
Sample Output:
> tail(df,8)
Day seq date
1094 365 365 1995-12-31
1095 365 365 1995-12-31
1096 1 366 1996-01-01
1097 1 366 1996-01-01
1098 1 366 1996-01-01
1099 2 367 1996-01-02
1100 2 367 1996-01-02
1101 2 367 1996-01-02
date gaps no problem for this solution:
library(data.table)
library(lubridate)
library(magrittr)
read.table(text = "
ID DAY ATRR1 ATRR2
1 1 0.2 0.4
2 1 1.2 0.5
3 1 0.8 1.4
1 2 1.3 1.5
2 2 2.3 0.3
3 2 1.7 1.3
1 3 1.5 1.4
2 3 2.1 1.3
3 3 1.2 0.3
1 365 1.5 1.4
2 365 2.1 1.3
3 365 1.2 0.3
1 1 1.5 1.4
2 1 2.1 1.3
3 1 1.2 0.3
1 2 1.3 1.5
2 2 2.3 0.3
3 2 1.7 1.3", header = T) %>% setDT -> x
x[, date := as.Date(DAY, origin = "1995-01-01") -1]
x[, date := {
t1 = c(0, diff(DAY))
t2 = ifelse(t1 < 0, 1, 0)
t3 = cumsum(t2)
t4 = date + years(t3)
}]

Create lower triangle genetic distance matrix

I have distance matrix like this
1 2 3 4 5
A 0.1 0.2 0.3 0.5 0.6
B 0.7 0.8 0.9 1 1.1
C 1.2 1.3 1.4 1.5 1.6
D 1.7 1.8 1.9 2 2.1
E 2.2 2.3 2.4 2.5 2.6
and now I want to create lower triangle matrix like this
1 2 3 4 5 A B C D E
1 0
2 0.1 0
3 0.2 0.1 0
4 0.4 0.3 0.2 0
5 0.5 0.4 0.3 0.1 0
A 0.1 0.2 0.3 0.5 0.6 0
B 0.7 0.8 0.9 1 1.1 0.6 0
C 1.2 1.3 1.4 1.5 1.6 1.1 0.5 0
D 1.7 1.8 1.9 2 2.1 1.6 1 0.5 0
E 2.2 2.3 2.4 2.5 2.6 2.1 1.5 1 0.5 0
I just deducted distance between 2 from 1 from first table to get genetic distance between 1 and 2 (0.2 - 0.1=0.1) and like this I did for rest of the entries and I do not know doing like this is correct or not?, after doing calculation like that made lower triangle matrix. I tried like this in R
x <- read.csv("AD2.csv", head = FALSE, sep = ",")
b<-lower.tri(b, diag = FALSE)
but I am getting only TRUE and FALSE as output not like distance matrix.
can any one help to solve this problem and here is link to my example data.
You can make use of dist to calculate sub-matrices. Then use cbind and create the top and bottom half. Then rbind the 2 halves. Then set upper triangular to NA to create the desired output.
mat <- rbind(
cbind(as.matrix(dist(tbl[1,])), tbl),
cbind(tbl, as.matrix(dist(tbl[,1])))
)
mat[upper.tri(mat, diag=FALSE)] <- NA
mat
Hope it helps.
data:
tbl <- as.matrix(read.table(text="1 2 3 4 5
A 0.1 0.2 0.3 0.5 0.6
B 0.7 0.8 0.9 1 1.1
C 1.2 1.3 1.4 1.5 1.6
D 1.7 1.8 1.9 2 2.1
E 2.2 2.3 2.4 2.5 2.6", header=TRUE, check.names=FALSE, row.names=1))

Summing Values based on Hour and Month and Re-arranging Summed Time Series

I am trying to aggregate (sum) values across months and hours and re-arrange the summed data so that hour and month are on different "axes". I would like the hour to be column headers and the month to be row headers with summed values in each cell. Here's what I mean, through a dummy data example (obviously 12 months are present and 24 hours in the real data):
Month <- c(1,1,2,2,3,3,3,4,4,4,5,5,5,5,6,7,8,9,10,11,12)
Hour <- c(4,1,3,2,5,5,1,4,3,6,0,0,2,3,1,2,3,4,5,6,2)
Value <- c(0.1,0.4,0.02,0.1,0.1,0.2,0.02,0.01,0.01,0.02,0.1,0.3,0.2,0.1,0.2, 0.1,0.3,0.1,0.01,0.01,0.1)
z <- data.frame(Month, Hour, Value)
head(z)
Month Hour Value
1 4 0.10
1 1 0.40
2 3 0.02
2 2 0.10
3 5 0.10
3 5 0.20
My desired output, Hour = column headers (there will be 24 total, this just shows first 6 hours), Month = row headers (there will be 12 total)
z
0 1 2 3 4 5 6
1 0.3 0.2 0.1 0.7 0.1 1.1 0.7
2 0.1 0.1 0.8 1.7 0.2 0.1 0.6
3 0.2 0.7 0.1 0.4 2.1 1.3 0.1
4 0.1 0.2 0.2 0.1 3.1 0.1 0.7
5 0.7 0.8 1.2 0.2 0.4 0.1 0.2
6 0.5 0.2 3.0 0.8 0.2 5.1 1.2
7 0.5 0.2 3.0 0.8 0.2 5.1 1.2
8 0.5 0.2 3.0 0.8 0.2 5.1 1.2
9 0.5 0.2 3.0 0.8 0.2 5.1 1.2
10 0.5 0.2 3.0 0.8 0.2 5.1 1.2
11 0.5 0.2 3.0 0.8 0.2 5.1 1.2
12 0.5 0.2 3.0 0.8 0.2 5.1 1.2
We can use xtabs to create a contingency table
xtabs(Value~Month+Hour)

Reshape matrix to data frame

I have association matrix file that looks like this (4 rows and 3 columns) .
test=read.table("test.csv", sep=",", header=T)
head(test)
LosAngeles SanDiego Seattle
1 2 3
A 1 0.1 0.2 0.2
B 2 0.2 0.4 0.2
C 3 0.3 0.5 0.3
D 4 0.2 0.5 0.1
What I want to is reshape this matrix file into data frame. The result should look something like this (12(= 4 * 3) rows and 3 columns):
RowNum ColumnNum Value
1 1 0.1
2 1 0.2
3 1 0.3
4 1 0.2
1 2 0.2
2 2 0.4
3 2 0.5
4 2 0.5
1 3 0.2
2 3 0.2
3 3 0.3
4 3 0.1
That is, if my matrix file has 100 rows and 90 columns. I want to make new data frame file that contains 9000 (= 100 * 90) rows and 3 columns. I've tried to use reshape package but but I do not seem to be able to get it right. Any suggestions how to solve this problem?
Use as.data.frame.table. Its the boss:
m <- matrix(data = c(0.1, 0.2, 0.2,
0.2, 0.4, 0.2,
0.3, 0.5, 0.3,
0.2, 0.5, 0.1),
nrow = 4, byrow = TRUE,
dimnames = list(row = 1:4, col = 1:3))
m
# col
# row 1 2 3
# 1 0.1 0.2 0.2
# 2 0.2 0.4 0.2
# 3 0.3 0.5 0.3
# 4 0.2 0.5 0.1
as.data.frame.table(m)
# row col Freq
# 1 1 1 0.1
# 2 2 1 0.2
# 3 3 1 0.3
# 4 4 1 0.2
# 5 1 2 0.2
# 6 2 2 0.4
# 7 3 2 0.5
# 8 4 2 0.5
# 9 1 3 0.2
# 10 2 3 0.2
# 11 3 3 0.3
# 12 4 3 0.1
This should do the trick:
test <- as.matrix(read.table(text="
1 2 3
1 0.1 0.2 0.2
2 0.2 0.4 0.2
3 0.3 0.5 0.3
4 0.2 0.5 0.1", header=TRUE))
data.frame(which(test==test, arr.ind=TRUE),
Value=test[which(test==test)],
row.names=NULL)
# row col Value
#1 1 1 0.1
#2 2 1 0.2
#3 3 1 0.3
#4 4 1 0.2
#5 1 2 0.2
#6 2 2 0.4
#7 3 2 0.5
#8 4 2 0.5
#9 1 3 0.2
#10 2 3 0.2
#11 3 3 0.3
#12 4 3 0.1

Resources