Reshape data frame for consecutive years - r

I have data about thousands of customers who visited stores in the 3 past years.
For each customer, I have :
ID
Combination of a year and the first store visited in this year.
Customer_Id | Year_*_Store
1 2010_A
1 2011_B
1 2012_C
2 2010_A
2 2011_B
2 2012_D
What I’d like to have is the following structure of data in order to visualize the evolution of the customers’behaviour with a riverplot( aka Sankey plot)
For instance the 2 customers, who firstly visited the store A in 2010, firstly visited the store B in 2011:
SOURCE | TARGET | NB_CUSTOMERS
2010_A 2011_B 2
2011_B 2012_C 1
2011_B 2012_D 1
I don't want links between two years which are not consecutive like 2010_A and 2012_D
How can I do that in R ?

I would do this with dplyr (faster)
df<-read.table(header=T,text="Customer_Id Year_Store
1 2010_A
1 2011_B
1 2012_C
2 2010_A
2 2011_B
2 2012_D")
require(dplyr) # for aggregation
require(riverplot) # for Sankey
targets<-
group_by(df,Customer_Id) %.% # group by Customer
mutate(source=Year_Store,target=c(as.character(Year_Store)[-1],NA)) %.% # add a lag to show the shift
filter(!is.na(target)) %.% # filter out empty edges
regroup(list("source","target")) %.% # regroup by source & target
summarise(len=length(Customer_Id)) %.% # count customers for relationship
mutate(step=as.integer(substr(target,1,4))-as.integer(substr(source,1,4))) %.% # add a step to show how many years
filter(step==1) # filter out relationships for non consec years
topnodes <- c(as.character(unique(df$Year_Store))) # unique nodes
nodes <- data.frame( ID=topnodes, # IDs
x=as.numeric(substr(topnodes,1,4)), # x value for plot
col= rainbow(length(topnodes)), # color each different
labels= topnodes, # labels
stringsAsFactors= FALSE )
edges<- # create list of list
lapply(unique(targets$source),function(x){
l<-as.list(filter(targets,source==x)$len) # targets per source
names(l)<-filter(targets,source==x)$target # name of target
l
})
names(edges)<-unique(targets$source) # name top level nodes
r <- makeRiver( nodes, edges) # make the River
plot( r ) # plot it!

Note that you can't have a * in column names (see ?make.names). Here is a basic approach:
Split Year_store into two separate columns Year and Store in your data frame; at the moment it contains two completely different kinds of data and you actually need to process them separately.
Make a NextYear column, defined as Year + 1
Make a NextStore column, in which you assign the store code matching Customer_Id and for which Year is the same as this row's NextYear, assigning NA if there is no record of the customer visiting a store the next year, and throwing an error if the data do not meet the required specification (are ambiguous about which store was visited first the next year).
Strip out any of the rows in which NextStore is NA, and combine the NextYear and NextStore columns into a NextYear_NextStore column.
Summarize your data frame by the Year_store and NextYear_NextStore columns e.g. using ddply in the plyr package.
Some sample data:
# same example data as question
customer.df <- data.frame(Customer_Id = c(1, 1, 1, 2, 2, 2),
Year_Store = c("2010_A", "2011_B", "2012_C", "2010_A", "2011_B", "2012_D"),
stringsAsFactors = FALSE)
# alternative data should throw error, customer 2 is inconsistent in 2011
badCustomer.df <- data.frame(Customer_Id = c(1, 1, 1, 2, 2, 2),
Year_Store = c("2010_A", "2011_B", "2012_C", "2010_A", "2011_B", "2011_D"),
stringsAsFactors = FALSE)
And an implementation:
require(plyr)
splitYearStore <- function(df) {
df$Year <- as.numeric(substring(df$Year_Store, 1, 4))
df$Store <- as.character(substring(df$Year_Store, 6))
return(df)
}
findNextStore <- function(df, matchCust, matchYear) {
matchingStore <- with(df,
df[Customer_Id == matchCust & Year == matchYear, "Store"])
if (length(matchingStore) == 0) {
return(NA)
} else if (length(matchingStore) > 1) {
errorString <- paste("Inconsistent store results for customer",
matchCust, "in year", matchYear)
stop(errorString)
} else {
return(matchingStore)
}
}
tabulateTransitions <- function(df) {
df <- splitYearStore(df)
df$NextYear <- df$Year + 1
df$NextStore <- mapply(findNextStore, matchCust = df$Customer_Id,
matchYear = df$NextYear, MoreArgs = list(df = df))
df$NextYear_NextStore <- with(df, paste(NextYear, NextStore, sep = "_"))
df <- df[!is.na(df$NextStore),]
df <- ddply(df, .(Source = Year_Store, Target = NextYear_NextStore),
summarise, No_Customers = length(Customer_Id))
return(df)
}
Results:
> tabulateTransitions(customer.df)
Source Target No_Customers
1 2010_A 2011_B 2
2 2011_B 2012_C 1
3 2011_B 2012_D 1
> tabulateTransitions(badCustomer.df)
Error in function (df, matchCust, matchYear) :
Inconsistent store results for customer 2 in year 2011
No attempt has been made to optimise; if your data set is massive then perhaps you should investigate a data.table solution.

Related

A question about looping and creating/joining tables

My code is meant to order a table called Football (imported csv2) and then, using a for loop, go through the data and return the row number of the start year and end year.
Football[order(Football$Year),]
start_year <- min(Football$Year)
end_year <- max(Football$Year)
for (i in 1:nrow(Football)
{
if (Football$Year[i] = start_year)
{
row_of_start <- i
}
if (Football$Year[i] = end_year)
{
row_of_end <- i
}
}
This produces the following error:
> if (Football$Year[1] = start_year) row_of_start <- 1
Error: unexpected '=' in "if (Football$Year[1] ="
I appreciate there are probably ways of doing this without a for loop (which I would be very appreciative to know) although I would also like to know how to make the for loop work (to further my understanding).
You can skip the loop entirely using which(). This will usually be faster and more legible:
# Create example data
set.seed(123)
Football <- data.frame(Year = sample(1990:2000, size = 10),
foo = sample(letters, size = 10))
# Sort the data as you have done
Football_sort <- Football[order(Football$Year), ]
# Get the row numbers of the min and max (start and end years)
which(with(Football_sort, Year == min(Year)))
#> [1] 1
which(with(Football_sort, Year == max(Year)))
#> [1] 10
Depending upon what you actually want to do, you can skip the ordering step as well. Both of the below depend upon the dplyr package to work.
If you just want the start and end year rows rather than their row numbers:
library(dplyr)
Football %>%
filter(Year %in% c(min(Year), max(Year)))
#> Year foo
#> 1 2000 e
#> 2 1990 d
If you want the "year number" of the start and end year:
Football %>%
summarise(start_year = 1,
end_year = max(Year) - min(Year))
#> start_year end_year
#> 1 1 10

Creating new variables for multiple data frames in a for loop

I have 8 data frames and I want to create a variable for each of this data frame. I use a for a loop and the code I have used is given below:
year <- 2001
dflist <- list(bhps01, bhps02, bhps03, bhps04, bhps05, bhps06, bhps07, bhps08)
for (df in dflist){
df[["year"]] <- as.character(year)
assign()
year <- year + 1
}
bhps01,...,bhps08 are the data frame objects and year is a character variable. bhps01 is the data frame for year 2001, bhps02 is the data frame for year 2002 and so on.
Each data corresponds to a year, so bhps01 corresponds to year 2001, bhps corresponds to 2002 and so on. So, I want to create a year variable for each one of these data. So, year variable would be "2001" for bhps01 data, "2002" for bhps02 and so on.
The code runs fine but it does not create the variable year for either of the data frames except the local variable df.
Can someone please explain the error in the above code? Or is there an alternative of doing the same thing?
The syntax in the for loop is wrong. I am not entirely sure what you try to accomplish but let us try this
year = 2001
A = data.frame(a = c(1, 1), b = c(2, 2))
B = data.frame(a = c(1, 1), b = c(2, 2))
L = list(A, B)
for (i in seq_along(L)) {
L[[i]][, dim(L[[i]])[2] + 1] = as.character(rep(year,dim(L[[i]])[1]))
year = year + 1
}
with output
> L
[[1]]
a b V3
1 1 2 2001
2 1 2 2001
[[2]]
a b V3
1 1 2 2002
2 1 2 2002
That is what you intend as output, correct?
In order to change the column name to "year" you can do
L = lapply(L, function(x) {colnames(x)[3] = "year"; x})
You take a copy of the dataframe from the list, and add the variable "year" to it, but then do not assign it anywhere, which is why it is discarded (i.e. not stored in a variable). Here's a fix:
year <- 2001
dflist <- list(bhps01, bhps02, bhps03, bhps04, bhps05, bhps06, bhps07, bhps08)
counter <- 0
for (df in dflist){
counter <- counter + 1
df[["year"]] <- as.character(year)
dflist[[counter]] <- df
year <- year + 1
}
If you want the original dataframes to be edited, you could assign the result back on the rather then into the list. This is a bit of an indirect route, and notice the change in creating the dflist with names. We create the df, and then assign it to the original name. For example:
year <- 2001
dflist <- list(bhps01 = bhps01, bhps02 = bhps02, bhps03 = bhps03, bhps04 = bhps04, bhps05 = bhps05, bhps06 = bhps06, bhps07 = bhps07, bhps08 = bhps08)
counter <- 0
for (df in dflist){
counter <- counter + 1
df[["year"]] <- as.character(year)
dflist[[counter]] <- df
assign(names(dflist)[counter], df)
year <- year + 1
}

Looping row numbers from one dataframe to create new data using logical operations in R

I would like to extract a dataframe that shows how many years it takes for NInd variable (dataset p1) to recover due to some culling happening, which is showed in dataframe e1.
I have the following datasets (mine are much bigger, but just to give you something to play with):
# Dataset 1
Batch <- c(2,2,2,2,2,2,2,2,2,2)
Rep <- c(0,0,0,0,0,0,0,0,0,0)
Year <- c(0,0,1,1,2,2,3,3,4,4)
RepSeason <- c(0,0,0,0,0,0,0,0,0,0)
PatchID <- c(17,25,19,16,21,24,23,20,18,33)
Species <- c(0,0,0,0,0,0,0,0,0,0)
Selected <- c(1,1,1,1,1,1,1,1,1,1)
Nculled <- c(811,4068,1755,449,1195,1711,619,4332,457,5883)
e1 <- data.frame(Batch,Rep,Year,RepSeason,PatchID,Species,Selected,Nculled)
# Dataset 2
Batch <- c(2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)
Rep <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
Year <- c(0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
RepSeason <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
PatchID <- c(17,25,19,16,21,24,23,20,18,33,17,25,19,16,21,24,23,20,18,33,17,25,19,16,21,24,23,20,18,33)
Ncells <- c(6,5,6,4,4,5,6,5,5,5,6,5,6,4,4,5,6,7,3,5,4,4,3,3,4,4,5,5,6,4)
Species <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
NInd <- c(656,656,262,350,175,218,919,218,984,875,700,190,93,127,52,54,292,12,43,68,308,1000,98,29,656,656,262,350,175,300)
p1 <- data.frame(Batch, Rep, Year, RepSeason, PatchID, Ncells, Species, NInd)
The dataset called e1 shows only those year where some culled happened to the population on specific PatchID.
I have created the following script that basically use each row from e1 to create a Recovery number. Maybe there is an easier way to get to the end, but this is the one I managed to get...
When you run this, you are working on ONE row of e1, so we focus on the first PatchID encounter and then do some calculation to match that up with p1, and finally I get a number named Recovery.
Now, the thing is my dataframe has 50,000 rows, so doing this over and over looks quite tedious. So, that's where I thought a loop may be useful. But have tried and no luck on how to make it work at all...
#here is where I would like the loop
e2 <- e1[1,] # Trial for one row only # but the idea is having here a loop that keep doing of comes next for each row
e3 <- e2 %>%
select(1,2,4,5)
p2 <- p1[,c(1,2,4,5,3,6,7,8)] # Re-order
row2 <- which(apply(p2, 1, function(x) return(all(x == e3))))
p3 <- p1 %>%
slice(row2) # all years with that particular patch in that particular Batch
#How many times was this patch cull during this replicate?
e4 <- e2[,c(1,2,4,5,3,6,7,8)]
e4 <- e4 %>%
select(1,2,3,4)
c_batch <- e1[,c(1,2,4,5,3,6,7,8)]
row <- which(apply(c_batch, 1, function(x) return(all(x == e4))))
c4 <- c_batch %>%
slice(row)
# Number of year to recover to 95% that had before culled
c5 <- c4[1,] # extract the first time was culled
c5 <- c5 %>%
select(1:5)
row3 <- which(apply(p2, 1, function(x) return(all(x == c5))))
Before <- p2 %>%
slice(row3)
NInd <- Before[,8] # Before culling number of individuals
Year2 <- Before[,5] # Year number where first culling happened (that actually the number corresponds to individuals before culling, as the Pop file is developed during reproduction, while Cull file is developed after!)
Percent <- (95*NInd)/100 # 95% recovery we want to achieve would correspond to having 95% of NInd BEFORE culled happened (Year2)
After <- p3 %>%
filter(NInd >= Percent & Year > Year2) # Look rows that match number of ind and Year
After2 <- After[1,] # we just want the first year where the recovery was successfully achieved
Recovery <- After2$Year - Before$Year
# no. of years to reach 95% of the population immediately before the cull
I reckon that the end would have to change somehow to to tell R that we are creating a dataframe with the Recovery, something like:
Batch <- c(1,1,2,2)
Rep <- c(0,0,0,0)
PatchID <- c(17,25,30,12)
Recovery <- c(1,2,1,5)
Final <- data.frame(Batch, Rep, PatchID, Recovery)
Would that be possible? OR this is just too mess-up and I may should try a different way?
Does the following solve the problem correectly?
I have first added a unique ID to your data.frames to allow matching of the cull and population files (this saves most of you complicated look-up code):
# Add a unique ID for the patch/replicate etc. (as done in the example code)
e1$RepID = paste(e1$Batch, e1$Rep, e1$RepSeason, e1$PatchID, sep = ":")
p1$RepID = paste(p1$Batch, p1$Rep, p1$RepSeason, p1$PatchID, sep = ":")
If you want a quick overview of the number of times each patch was culled, the new RepID makes this easy:
# How many times was each patch culled?
table(p1$RepID)
Then you want a loop to check the recovery time after each cull.
My solutions uses an sapply loop (which also retains the RepIDs so you can match to other metadata later):
sapply(unique(e1$RepID), function(rep_id){
all_cull_events = e1[e1$RepID == rep_id, , drop = F]
first_year = order(all_cull_events$Year)[1] # The first cull year (assuming data might not be in temporal order)
first_cull_event = all_cull_events[first_year, ] # The row corresponding to the first cull event
population_counts = p1[p1$RepID == first_cull_event$RepID, ] # The population counts for this plot/replicate
population_counts = population_counts[order(population_counts$Year), ] # Order by year (assuming data might not be in temporal order)
pop_at_first_cull_event = population_counts[population_counts$Year == first_cull_event$Year, "NInd"]
population_counts_after_cull = population_counts[population_counts$Year > first_cull_event$Year, , drop = F]
years_to_recovery = which(population_counts_after_cull$NInd >= (pop_at_first_cull_event * .95))[1] # First year to pass 95% threshold
return(years_to_recovery)
})
2:0:0:17 2:0:0:25 2:0:0:19 2:0:0:16 2:0:0:21 2:0:0:24 2:0:0:23 2:0:0:20 2:0:0:18 2:0:0:33
1 2 1 NA NA NA NA NA NA NA
(The output contains some NAs because the first cull year was outside the range of population counts in the data you gave us)
Please check this against your expected output though. There were some aspects of the question and example code that were not clear (see comments).

Pull out only two variables from a column

I have a dataframe in R for which one column has multiple variables. The variables either start with ABC, DEF, GHI. Those variables are followed by a series of 6 numbers (ie ABC052689, ABC062895, DEF045158).
For each row, i would like to pull one instance of ABC (the one with the largest number).
If the row has ABC052689, ABC062895, DEF045158, I would like it to pull out ABC062895 because it is greater than ABC052689.
I would then want to do the same for the variable that starts with DEF######.
I have managed to filter the data to have rows where ABC is there and either DEF or GHI is there:
library(tidyverse)
data_with_ABC <- test %>%
filter(str_detect(car,"ABC"))
data_with_ABC_and_DEF_or_GHI <- data_with_ABC %>%
filter(str_detect(car, "DEF") | str_detect(car, "GHI"))
I don't know how to pull out let's say ABC with the greatest number
ABC052689, ABC062895, DEF045158 -> ABC062895
For a base R solution, we can try using lapply along with strsplit to identify the greatest ABC plate in each CSV string, in each row.
df <- data.frame(car=c("ABC052689,ABC062895,DEF045158"), id=c(1),
stringsAsFactors=FALSE)
df$largest <- lapply(df$car, function(x) {
cars <- strsplit(x, ",", fixed=TRUE)[[1]]
cars <- cars[substr(cars, 1, 3) == "ABC"]
max <- cars[which.max(substr(cars, 4, 9))]
return(max)
})
df
car id largest
1 ABC052689,ABC062895,DEF045158 1 ABC062895
Note that we don't need to worry about casting the substring of the plate number, because it is fixed width text. This means that it should sort properly even as text.
Besides Tim's answer, if you want to do all ABC/DEF at one time, following code may help with library(tidyverse):
> df <- data.frame(car=c("ABC052689", "ABC062895", "DEF045158", "DEF192345"), stringsAsFactors=FALSE)
>
> df2 = df %>%
+ mutate(state = str_sub(car, 1, 3), plate = str_sub(car, 4, 9))
>
> df2
car state plate
1 ABC052689 ABC 052689
2 ABC062895 ABC 062895
3 DEF045158 DEF 045158
4 DEF192345 DEF 192345
>
> df2 %>%
+ group_by(state) %>%
+ summarise(maxplate = max(plate)) %>%
+ mutate(full = str_c(state, maxplate))
# A tibble: 2 x 3
state maxplate full
<chr> <chr> <chr>
1 ABC 062895 ABC062895
2 DEF 192345 DEF192345

Calculating grades in r

I am calculating final averages for a course. There are about 500 students, and the grades are organized into a .csv file. Column headers include:
Name, HW1, ..., HW10, Quiz1, ..., Quiz5, Exam1, Exam2, Final
Each is weighted differently, and that shouldn't be an issue programming. However, the lowest 2 HW and the lowest Quiz are dropped for each student. How could I program this in r? Note that the HW/Quiz dropped for each student may be different (i.e. Student A has HW2, HW5, Quiz2 dropped, Student B has HW4, HW8, Quiz1 dropped).
Here is a simpler solution. The sum_after_drop function takes a vector x and drops the i lowest scores and sums up the remaining. We invoke this function for each row in the dataset. ddply is overkill for this job, but keeps things simple. You should be able to do this with apply, except that you will have to convert the end result to a data frame.
The actual grade calculations can then be carried out on dd2. Note that using the cut function with breaks is a simple way to get letter grades from the total scores.
library(plyr)
sum_after_drop <- function(x, i){
sum(sort(x)[-(1:i)])
}
dd2 = ddply(dd, .(Name), function(d){
hw = sum_after_drop(d[,grepl("HW", nms)], 1)
qz = sum_after_drop(d[,grepl("Quiz", nms)], 1)
data.frame(hw = hw, qz = qz)
})
Here's a sketch of how you could approach it using the reshape2 package and base functions.
#sample data
set.seed(734)
dd<-data.frame(
Name=letters[1:20],
HW1=rpois(20,7),
HW2=rpois(20,7),
HW3=rpois(20,7),
Quiz1=rpois(20,15),
Quiz2=rpois(20,15),
Quiz3=rpois(20,15)
)
Now I convert it to long format and split apart the field names
require(reshape2)
mm<-melt(dd, "Name")
mm<-cbind(mm,
colsplit(gsub("(\\w+)(\\d+)","\\1:\\2",mm$variable, perl=T), ":",
names=c("type","number"))
)
Now i can use by() to get a data.frame for each name and do the rest of the calculations. Here i just drop the lowest homework and lowest quiz and i give homework a weight of .2 and quizzes a weight of .8 (assuming all home works were worth 15pts and quizzes 25 pts).
grades<-unclass(by(mm, mm$Name, function(x) {
hw <- tail(sort(x$value[x$type=="HW"]), -1);
quiz <- tail(sort(x$value[x$type=="Quiz"]), -1);
(sum(hw)*.2 + sum(quiz)*.8) / (length(hw)*15*.2+length(quiz)*25*.8)
}))
attr(grades, "call")<-NULL #get rid of crud from by()
grades;
Let's check our work. Look at student "c"
Name HW1 HW2 HW3 Quiz1 Quiz2 Quiz3
c 6 9 7 21 20 14
Their grade should be
((9+7)*.2+(21+20)*.8) / ((15+15)*.2 + (25+25)*.8) = 0.7826087
and in fact, we see
grades["c"] == 0.7826087
Here's a solution with dplyr. It ranks the scores by student and type of assignment (i.e. calculates the rank order of all of student 1's homeworks, etc.), then filters out the lowest 1 (or 2, or whatever). dplyr's syntax is pretty intuitive—you should be able to walk through the code fairly easily.
# Load libraries
library(reshape2)
library(dplyr)
# Sample data
grades <- data.frame(name=c("Sally", "Jim"),
HW1=c(10, 9),
HW2=c(10, 5),
HW3=c(5, 10),
HW4=c(6, 9),
HW5=c(8, 9),
Quiz1=c(9, 5),
Quiz2=c(9, 10),
Quiz3=c(10, 8),
Exam1=c(95, 96))
# Melt into long form
grades.long <- melt(grades, id.vars="name", variable.name="graded.name") %.%
mutate(graded.type=factor(sub("\\d+","", graded.name)))
grades.long
# Remove the lowest scores for each graded type
grades.filtered <- grades.long %.%
group_by(name, graded.type) %.%
mutate(ranked.score=rank(value, ties.method="first")) %.% # Rank all the scores
filter((ranked.score > 2 & graded.type=="HW") | # Ignore the lowest two HWs
(ranked.score > 1 & graded.type=="Quiz") | # Ignore the lowest quiz
(graded.type=="Exam"))
grades.filtered
# Calculate the average for each graded type
grade.totals <- grades.filtered %.%
group_by(name, graded.type) %.%
summarize(total=mean(value))
grade.totals
# Unmelt, just for fun
final.grades <- dcast(grade.totals, name ~ graded.type, value.var="total")
final.grades
You technically could add the summarize(total=mean(value)) to the grades.filtered data frame rather than making a separate grade.totals data frame—I separated them into multiple data frames for didactical reasons.

Resources