Create id with a pair of two sequenced number - r

In my data, I have a year-week column and hope to create a variable with combining two weeks.
for example, here is my column x,
x = c(201336, 201336, 201336, 201337, 201337, 201340, 201341, 201341, 201342, 201343, 201344, ...)
In x, 201336 means 36th week of 2013, 201337 means 37th week of 2013 and so on. I want to indicate that
(201336,201337) -> 1
(201338,201339) -> 2
(201340,201341) -> 3
(201342,201343) -> 4
(201344,201345) -> 5
and so on
So, My desired vector is that
x2 = c(1,1,1,1,1,3,3,3,4,4,5,...)

if there is only one year as #Psidom mentioned above, it seems to be:
(x - min(x))%/%2 + 1
[update]
If your vector contains entries for different years:
set.seed(112358)
x <- sample(1992:2017,1000, replace = T)
x <- as.integer((x + sample(seq(.1,.52,length.out=length(1992:2017)),
1000,replace = T)
)*100)
x <- x[order(x)]
names(x) <- x%/%100
head(x, 11)
#1992 1992 1992 1992 1992 1992 1992 1992 1992 1992 1992
#199210 199211 199213 199213 199215 199216 199220 199220 199220 199221 199221
tail(x, 11)
#2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017
#201736 201738 201740 201740 201741 201741 201743 201745 201746 201750 201752
foo <- function(x){
return((x-min(x))%/%2+1)
}
xm <- unlist(sapply(unique(names(x)), function(i) foo(x[names(x) == i])), use.names = F)
mdf <- data.frame(original = x, modified = xm)
head(mdf)
# original modified
#1 199210 1
#2 199211 1
#3 199213 2
#4 199213 2
#5 199215 3
#6 199216 4
tail(mdf)
# original modified
#995 201741 16
#996 201743 17
#997 201745 18
#998 201746 19
#999 201750 21
#1000 201752 22

This should work for you:
x = c(201336, 201336, 201336, 201337, 201337, 201340, 201341, 201341, 201342, 201343, 201344)
as.integer((x - 201336) / 2) + 1
# [1] 1 1 1 1 1 3 3 3 4 4 5

Related

How to use a loop to create panel data by subsetting and merging a lot of different data frames in R?

I've looked around but I can't find an answer to this!
I've imported a large number of datasets to R.
Each dataset contains information for a single year (ex. df_2012, df_2013, df_2014 etc).
All the datasets have the same variables/columns (ex. varA_2012 in df_2012 corresponds to varA_2013 in df_2013).
I want to create a df with my id variable and varA_2012, varB_2012, varA_2013, varB_2013, varA_2014, varB_2014 etc
I'm trying to create a loop that helps me extract the few columns that I'm interested in (varA_XXXX, varB_XXXX) in each data frame and then do a full join based on my id var.
I haven't used R in a very long time...
So far, I've tried this:
id <- c("France", "Belgium", "Spain")
varA_2012 <- c(1,2,3)
varB_2012 <- c(7,2,9)
varC_2012 <- c(1,56,0)
varD_2012 <- c(13,55,8)
varA_2013 <- c(34,3,56)
varB_2013 <- c(2,53,5)
varC_2013 <- c(24,3,45)
varD_2013 <- c(27,13,8)
varA_2014 <- c(9,10,5)
varB_2014 <- c(95,30,75)
varC_2014 <- c(99,0,51)
varD_2014 <- c(9,40,1)
df_2012 <-data.frame(id, varA_2012, varB_2012, varC_2012, varD_2012)
df_2013 <-data.frame(id, varA_2013, varB_2013, varC_2013, varD_2013)
df_2014 <-data.frame(id, varA_2014, varB_2014, varC_2014, varD_2014)
year = c(2012:2014)
for(i in 1:length(year)) {
df_[i] <- df_[I][df_[i]$id, df_[i]$varA_[i], df_[i]$varB_[i], ]
list2env(df_[i], .GlobalEnv)
}
panel_df <- Reduce(function(x, y) merge(x, y, by="if"), list(df_2012, df_2013, df_2014))
I know that there are probably loads of errors in here.
Here are a couple of options; however, it's unclear what you want the expected output to look like.
If you want a wide format, then we can use tidyverse to do:
library(tidyverse)
results <-
map(list(df_2012, df_2013, df_2014), function(x)
x %>% dplyr::select(id, starts_with("varA"), starts_with("varB"))) %>%
reduce(., function(x, y)
left_join(x, y, all = TRUE, by = "id"))
Output
id varA_2012 varB_2012 varA_2013 varB_2013 varA_2014 varB_2014
1 Belgium 2 2 3 53 10 30
2 France 1 7 34 2 9 95
3 Spain 3 9 56 5 5 75
However, if you need it in a long format, then we could pivot the data:
results %>%
pivot_longer(-id, names_to = c("variable", "year"), names_sep = "_")
Output
id variable year value
<chr> <chr> <chr> <dbl>
1 France varA 2012 1
2 France varB 2012 7
3 France varA 2013 34
4 France varB 2013 2
5 France varA 2014 9
6 France varB 2014 95
7 Belgium varA 2012 2
8 Belgium varB 2012 2
9 Belgium varA 2013 3
10 Belgium varB 2013 53
11 Belgium varA 2014 10
12 Belgium varB 2014 30
13 Spain varA 2012 3
14 Spain varB 2012 9
15 Spain varA 2013 56
16 Spain varB 2013 5
17 Spain varA 2014 5
18 Spain varB 2014 75
Or if using base R for the wide format, then we can do:
results <-
lapply(list(df_2012, df_2013, df_2014), function(x)
subset(x, select = c("id", names(x)[startsWith(names(x), "varA")], names(x)[startsWith(names(x), "varB")])))
results <-
Reduce(function(x, y)
merge(x, y, all = TRUE, by = "id"), results)
From your initial for loop attempt, it seems the code below may help
> (df <- Reduce(merge, list(df_2012, df_2013, df_2014)))[grepl("^(id|var(A|B))",names(df))]
id varA_2012 varB_2012 varA_2013 varB_2013 varA_2014 varB_2014
1 Belgium 2 2 3 53 10 30
2 France 1 7 34 2 9 95
3 Spain 3 9 56 5 5 75

Euclidean distant for distinct classes of factors iterated by groups

*Update: The answer suggested by Rui is great and works as it should. However, when I run it on about 7 million observations (my actual dataset), R gets stuck in a computational block (I'm using a machine with 64gb of RAM). Any other solutions are greatly appreciated!
I have a dataframe of patents consisting of the firms, application years, patent number, and patent classes. I want to calculate the Euclidean distance between consecutive years for each firm based on patent classes according to the following formula:
Where Xi represents the number of patents belonging to a specific class in year t, and Yi represents the number of patents belonging to a specific class in the previous year (t-1).
To further illustrate this, consider the following dataset:
df <- data.table(Firm = rep(c(LETTERS[1:2]),each=6), Year = rep(c(1990,1990,1991,1992,1992,1993),2),
Patent_Number = sample(184785:194785,12,replace = FALSE),
Patent_Class = c(12,5,31,12,31,6,15,15,15,3,3,1))
> df
Firm Year Patent_Number Patent_Class
1: A 1990 192473 12
2: A 1990 193702 5
3: A 1991 191889 31
4: A 1992 193341 12
5: A 1992 189512 31
6: A 1993 185582 6
7: B 1990 190838 15
8: B 1990 189322 15
9: B 1991 190620 15
10: B 1992 193443 3
11: B 1992 189937 3
12: B 1993 194146 1
Since year 1990 is the beginning year for Firm A, there is no Euclidean distance for that year (NAs should be produced. Moving forward to year 1991, the distinct classses for this year (1991) and the previous year (1990) are 31, 5, and 12. Therefore, the above formula is summed over these three distinct classes (there is three distinc 'i's). So the formula's output will be:
Following the same calculation and reiterating over firms, the final output should be:
> df
Firm Year Patent_Number Patent_Class El_Dist
1: A 1990 192473 12 NA
2: A 1990 193702 5 NA
3: A 1991 191889 31 1.2247450
4: A 1992 193341 12 0.7071068
5: A 1992 189512 31 0.7071068
6: A 1993 185582 6 1.2247450
7: B 1990 190838 15 NA
8: B 1990 189322 15 NA
9: B 1991 190620 15 0.5000000
10: B 1992 193443 3 1.1180340
11: B 1992 189937 3 1.1180340
12: B 1993 194146 1 1.1180340
I'm preferably looking for a data.table solution for speed purposes.
Thank you very much in advance for any help.
I believe that the function below does what the question asks for, but the results for Firm == "B" are not equal to the question's.
fEl_Dist <- function(X){
Year <- X[["Year"]]
PatentClass <- X[["Patent_Class"]]
sapply(seq_along(Year), function(i){
j <- which(Year %in% (Year[i] - 1:0))
tbl <- table(Year[j], PatentClass[j])
if(NROW(tbl) == 1){
NA_real_
} else {
numer <- sum((tbl[2, ] - tbl[1, ])^2)
denom <- sum(tbl[2, ]^2)*sum(tbl[1, ]^2)
sqrt(numer/denom)
}
})
}
setDT(df)[, El_Dist := fEl_Dist(.SD),
by = .(Firm),
.SDcols = c("Year", "Patent_Class")]
head(df)
# Firm Year Patent_Number Patent_Class El_Dist
#1: A 1990 190948 12 NA
#2: A 1990 186156 5 NA
#3: A 1991 190801 31 1.2247449
#4: A 1992 185226 12 0.7071068
#5: A 1992 185900 31 0.7071068
#6: A 1993 186928 6 1.2247449

arrange one below the other every 2 columns from data frame in R

Hi I have a df as below which show date and their respected
date 1_val date 2_val . . . . date n_val
2014 23 2014 33 . . . . 2014 34
2015 22 2016 12 . . . . 2016 99
i have tried with hard coding to arrange the columns one below the other
for 1&2 columns
a=1
b=2
names_2<-df[,c(a,b)]
colnames(names_2)[1]<-"Date"
names_2 <- names_2[!apply(is.na(names_2) | names_2 == "", 1, all),]
names_2<-melt(names_2,id=colnames(names_2)[1])
samp_out<-names_2
for 3&4 columns
a=3
b=4
names_2<-df[,c(a,b)]
colnames(names_2)[1]<-"Date"
names_2 <- names_2[!apply(is.na(names_2) | names_2 == "", 1, all),]
names_2<-melt(names_2,id=colnames(names_2)[1])
samp_out1<-names_2
till n-numbers
df1= rbind(samp_out,samp_out1,......samp_out_n)
output
date variable value
2014 1_val 23
2015 1_val 22
2014 2_val 33
2016 2_val 12
.
.
2014 n_val 34
2016 n_val 99
Thanks in advance
The function melt in the package data.table does that:
melt(df, id = "Date", measure = patterns("_val"))
You can specify the name of the variable to pivot on (Date in this case) and a pattern in the variables you want to keep the values of. You can also supply a vector with all the variablenames instead.
> DT <- data.table(Date = c(2014,2013), `1_val` = c(33, 32), Date = c(2014, 2013), `2_val` = c(65, 34))
> DT
Date 1_val Date 2_val
1: 2014 33 2014 65
2: 2013 32 2013 34
> melt(DT, id = "Date", measure = patterns("_val"))
Date variable value
1: 2014 1_val 33
2: 2013 1_val 32
3: 2014 2_val 65
4: 2013 2_val 34
You can use stack from base R,
setNames(data.frame(stack(df[c(TRUE, FALSE)])[1],
stack(df[c(FALSE, TRUE)])),
c('date', 'value', 'variable'))
# date value variable
#1 2014 33 1_val
#2 2013 32 1_val
#3 2014 65 2_val
#4 2013 34 2_val
Define the untidy rectangle
library(magrittr)
csv <- "date,1_val,date,2_val,date,3_val
2014,23,2014,33,2014,34
2015,22,2016,12,2016,99"
Read into a data frame, then transform into a long/eav rectangle.
ds_eav <- csv %>%
readr::read_csv() %>%
tibble::rownames_to_column(var="height") %>%
tidyr::gather(key=key, value=value, -height)
output:
# A tibble: 12 x 4
key index value height
<chr> <int> <int> <int>
1 date 1 2014 1
2 date 1 2015 2
3 value 1 23 1
4 value 1 22 2
5 date 2 2014 1
6 date 2 2016 2
7 value 2 33 1
8 value 2 12 2
9 date 3 2014 1
10 date 3 2016 2
11 value 3 34 1
12 value 3 99 2
Identify which rows are dates/values. Then shift up dates' index by 1.
ds_eav <- ds_eav %>%
dplyr::mutate(
index_val = sub("^(\\d+)_val$" , "\\1", key),
index_date = sub("^date_(\\d+)$", "\\1", key),
index_date = dplyr::if_else(key=="date", "0", index_date),
key = dplyr::if_else(grepl("^date(_\\d+)*", key), "date", "value"),
index = dplyr::if_else(key=="date", index_date, index_val),
index = as.integer(index),
index = index + dplyr::if_else(key=="date", 1L, 0L)
) %>%
dplyr::select(key, index, value, height)
Follow the advice of #jarko-dubbeldam and use spread/gather on the last step too
ds_eav %>%
tidyr::spread(key=key, value=value)
output:
# A tibble: 6 x 4
index height date value
* <int> <int> <int> <int>
1 1 1 2014 23
2 1 2 2015 22
3 2 1 2014 33
4 2 2 2016 12
5 3 1 2014 34
6 3 2 2016 99
You can use paste0(index, "_val") to get you exact output. But I'd prefer to keep them as integers, so you can do math on them in necessary (eg, max()).
edit 1: incorporate the advice & corrections of #jarko-dubbeldam and #hnskd.
edit 2: use rownames_to_column() in case the input isn't a balanced rectangle (eg, one column doesn't all all the rows).

Conditional cumulative subtraction

This is what my data.table looks like:
library(data.table)
dt <- fread('
Year Total Shares Balance
2017 10 1 10
2016 12 2 9
2015 10 2 7
2014 10 3 6
2013 10 NA 3
')
**Balance** is my desired column. I am trying to find the cumulative subtractions by taking the first value of Total which is 10(it should also be the first value of Balance field) and then cumulatively subtracting values in Shares. So the second value is 10-1 =9 and the third value is 9-2 = 7 and such. There is one condition, if the Year is 2014, then subtract the Shares value after dividing it by 2. so the fourth value is 7-(2/2)=6 and the fifth value is 6-3=3. I want to end the calc as of the last row.
My attempt is:
dt[, Balance:= ifelse( Year == 2014, cumsum(Total[1]-Shares/2), cumsum(Total[1] - Shares))]
Here is one method.
dt[, Balance2 := Total[1] - cumsum(shift(Shares * (1 - (0.5 *(Year == 2015))), fill=0))]
shift is used to create a lag variable, and the first element is filled with 0, using fill=0. The other elements are calculated as Shares * (1 - (0.5 *(Year == 2015))) which return Shares except when Years == 2015, in which case Shares * 0.5 is returned.
which returns
dt
Year Total Shares Balance Balance2
1: 2017 10 1 10 10
2: 2016 12 2 9 9
3: 2015 10 2 7 7
4: 2014 10 3 6 6
5: 2013 10 NA 3 3
FWIW, I wanted to provide a functional alternative that would allow for more flexible calculations in the cumulative differences, indexing, etc. I also have read in the data with read.table.
dt <- read.table(header=TRUE, text='
Year Total Shares Balance
2017 10 1 10
2016 12 2 9
2015 10 2 7
2014 10 3 6
2013 10 NA 3
')
makeNewBalance <- function(dt) {
output <- NULL
for (i in 1:nrow(dt)) {
if (i==1) {
output[i] <- dt$Total[i]
} else {
output[i] <- output[i-1] - as.integer(ifelse(dt$Year[i]==2014,
dt$Shares[i-1]/2,
dt$Shares[i-1]))
}
}
return(output)
}
dt$NewBalance <- makeNewBalance(dt)
which also returns
> dt
Year Total Shares Balance NewBalance
1 2017 10 1 10 10
2 2016 12 2 9 9
3 2015 10 2 7 7
4 2014 10 3 6 6
5 2013 10 NA 3 3

R Cleaning and reordering names/serial numbers in data frame

Let's say I have a data frame as follows in R:
Data <- data.frame("SerialNum" = character(), "Year" = integer(), "Name" = character(), stringsAsFactors = F)
Data[1,] <- c("983\n837\n424\n ", 2015, "Michael\nLewis\nPaul\n ")
Data[2,] <- c("123\n456\n789\n136", 2014, "Elaine\nJerry\nGeorge\nKramer")
Data[3,] <- c("987\n654\n321\n975\n ", 2010, "John\nPaul\nGeorge\nRingo\nNA")
Data[4,] <- c("424\n983\n837", 2015, "Paul\nMichael\nLewis")
Data[5,] <- c("456\n789\n123\n136", 2014, "Jerry\nGeorge\nElaine\nKramer")
What I want to do is the following:
Split up each string of names and each string of serial numbers so that they are their own vectors (or a list of string vectors).
Eliminate any character "NA" in either set of vectors or any blank spaces denoted by "...\n ".
Reorder each list of names alphabetically and reorder the corresponding serial numbers according to the same permutation.
Concatenate each vector in the same fashion it was originally (I usually do this with paste(., collapse = "\n")).
My issue is how to do this without using a for loop. What is an object-oriented way to do this? As a first attempt in this direction I originally made a list by the command LIST <- strsplit(Data$Name, split = "\n") and from here I need a for loop in order to find the permutations of the names, which seems like a process that won't scale according to my actual data. Additionally, once I make the list LIST I'm not sure how I go about removing NA symbols or blank spaces. Any help is appreciated!
Using lapply I take each row of the data frame and turn it into a new data frame with one name per row. This creates a list of 5 data frames, one for each row of the original data frame.
seinfeld = lapply(1:nrow(Data), function(i) {
# Turn strings into data frame with one name per row
dat = data.frame(SerialNum=unlist(strsplit(Data[i,"SerialNum"], split="\n")),
Year=Data[i,"Year"],
Name=unlist(strsplit(Data[i,"Name"], split="\n")))
# Get rid of empty strings and NA values
dat = dat[!(dat$Name %in% c(""," ","NA")), ]
# Order alphabetically
dat = dat[order(dat$Name), ]
})
UPDATE: Based on your comment, let me know if this is the result you're trying to achieve:
seinfeld = lapply(1:nrow(Data), function(i) {
# Turn strings into data frame with one name per row
dat = data.frame(SerialNum=unlist(strsplit(Data[i,"SerialNum"], split="\n")),
Name=unlist(strsplit(Data[i,"Name"], split="\n")))
# Get rid of empty strings and NA values
dat = dat[!(dat$Name %in% c(""," ","NA")), ]
# Order alphabetically
dat = dat[order(dat$Name), ]
# Collapse back into a single row with the new sort order
dat = data.frame(SerialNum=paste(dat[, "SerialNum"], collapse="\n"),
Year=Data[i, "Year"],
Name=paste(dat[, "Name"], collapse="\n"))
})
do.call(rbind, seinfeld)
SerialNum Year Name
1 837\n983\n424 2015 Lewis\nMichael\nPaul
2 123\n789\n456\n136 2014 Elaine\nGeorge\nJerry\nKramer
3 321\n987\n654\n975 2010 George\nJohn\nPaul\nRingo
4 837\n983\n424 2015 Lewis\nMichael\nPaul
5 123\n789\n456\n136 2014 Elaine\nGeorge\nJerry\nKramer
eipi10 offered a great answer. In addition to that, I'd like to leave what I tried mainly with data.table. First, I split two columns (i.e., SerialNum and Name) with cSplit(), added an index with add_rownames(), and split the data by the index. In the first lapply(), I used Stacked() from the splitstackshape package. I stacked SerialNum and Name; separated SeriaNum and Name become two columns, as you see in a part of temp2. In the second lapply(), I used merge from the data.table package. Then, I removed rows with NAs (lapply(na.omit)), combined all data tables (rbindlist), and changed order of rows by rowname, which is row number of the original data) and Name (setorder(rowname, Name))
library(data.table)
library(splitstackshape)
library(dplyr)
cSplit(mydf, c("SerialNum", "Name"), direction = "wide",
type.convert = FALSE, sep = "\n") %>%
add_rownames %>%
split(f = .$rowname) -> temp
#a part of temp
#$`1`
#Source: local data frame [1 x 12]
#
#rowname Year SerialNum_1 SerialNum_2 SerialNum_3 SerialNum_4 SerialNum_5 Name_1 Name_2
#(chr) (dbl) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
#1 1 2015 983 837 424 NA NA Michael Lewis
#Variables not shown: Name_3 (chr), Name_4 (chr), Name_5 (chr)
lapply(temp, function(x){
Stacked(x, var.stubs = c("SerialNum", "Name"), sep = "_")
}) -> temp2
# A part of temp2
#$`1`
#$`1`$SerialNum
# rowname Year .time_1 SerialNum
#1: 1 2015 1 983
#2: 1 2015 2 837
#3: 1 2015 3 424
#4: 1 2015 4 NA
#5: 1 2015 5 NA
#
#$`1`$Name
# rowname Year .time_1 Name
#1: 1 2015 1 Michael
#2: 1 2015 2 Lewis
#3: 1 2015 3 Paul
#4: 1 2015 4 NA
#5: 1 2015 5 NA
lapply(1:nrow(mydf), function(x){
merge(temp2[[x]]$SerialNum, temp2[[x]]$Name, by = c("rowname", "Year", ".time_1"))
}) %>%
lapply(na.omit) %>%
rbindlist %>%
setorder(rowname, Name) -> out
print(out)
# rowname Year .time_1 SerialNum Name
# 1: 1 2015 2 837 Lewis
# 2: 1 2015 1 983 Michael
# 3: 1 2015 3 424 Paul
# 4: 2 2014 1 123 Elaine
# 5: 2 2014 3 789 George
# 6: 2 2014 2 456 Jerry
# 7: 2 2014 4 136 Kramer
# 8: 3 2010 3 321 George
# 9: 3 2010 1 987 John
#10: 3 2010 2 654 Paul
#11: 3 2010 4 975 Ringo
#12: 4 2015 3 837 Lewis
#13: 4 2015 2 983 Michael
#14: 4 2015 1 424 Paul
#15: 5 2014 3 123 Elaine
#16: 5 2014 2 789 George
#17: 5 2014 1 456 Jerry
#18: 5 2014 4 136 Kramer
DATA
mydf <- structure(list(SerialNum = c("983\n837\n424\n ", "123\n456\n789\n136",
"987\n654\n321\n975\n ", "424\n983\n837", "456\n789\n123\n136"
), Year = c(2015, 2014, 2010, 2015, 2014), Name = c("Michael\nLewis\nPaul\n ",
"Elaine\nJerry\nGeorge\nKramer", "John\nPaul\nGeorge\nRingo\nNA",
"Paul\nMichael\nLewis", "Jerry\nGeorge\nElaine\nKramer")), .Names = c("SerialNum",
"Year", "Name"), row.names = c(NA, -5L), class = "data.frame")

Resources