Scraping table in RVest when there are multiple rows that span columns - r

I am trying to scrape the following webpage: https://rptsvr1.tea.texas.gov/cgi/sas/broker?_service=marykay&_program=sfadhoc.budget_report_2022.sas&_service=appserv&_debug=0&who_box=&who_list=031901
Initially, I tried this code:
library(rvest)
library(tidyverse)
webpage = read_html("https://rptsvr1.tea.texas.gov/cgi/sas/broker?_service=marykay&_program=sfadhoc.budget_report_2022.sas&_service=appserv&_debug=0&who_box=&who_list=101912")
tables <- html_nodes(webpage, "table") %>%
html_table()
budget = tables[[2]]
Then I realized that the results were messy, because html_table() cannot read tables where rows span multiple columns. I could clean the dataframe up after scraping it, but I'm wondering if perhaps there's a way to scrape it that avoids the issue in the first place.
I read the answers to similar questions, but they all involved a situation where only the header rows spanned multiple columns, or only the first column spanned multiple rows. In this case, this table is made up of multiple tables squished together, so it's like there are headers all throughout the table. Is there a solution that can deal with rows spanning columns throughout the table?

One way could be as follows:
Decide on your final format. I have selected on a flat file format which entails splitting some header values, those with merged cells, into separate new columns, and repeating these values later
Gather and loop all the rows of data
In the loop process the row according to a set of rules
Generate your output row in the loop and write to a sized list
Drop empty entries list
Convert to DataFrame and add any additional info wanted
The rules I chose to apply where based on looking at the first column of each row as follows (pseudo-code):
if (first column className contains "linecontent") {
if (if first column has br and span child elements) {
split colmn text, after trimming, on line break
assign split to red and measure columns (re-use as filldown)
} else {
assign measure the trimmed column text
}
} else {
we are on a non-header row so grab the $ and % values by matching on class "data"
}
R:
library(tidyverse)
library(rvest)
library(httr2)
page <- request("https://rptsvr1.tea.texas.gov/cgi/sas/broker?_service=marykay&_program=sfadhoc.budget_report_2022.sas&_service=appserv&_debug=0&who_box=&who_list=031901") %>%
req_headers(
"user-agent" = "Mozilla/5.0",
"accept" = "text/html",
"connection" = "keep-alive"
) %>%
req_perform() %>%
resp_body_html()
rows <- page %>% html_elements(".table tbody tr")
row_data <- vector("list", length(rows))
row <- 1
for (row_node in rows) {
first_column_node <- row_node %>% html_element("td")
first_column_node_classname <- html_attr(first_column_node, "class")
is_new_column <- if_else(grepl("linecontent", first_column_node_classname), 1, 0)
column_needs_split <- if_else((length(html_elements(first_column_node, "br")) > 0 &
!is.na(html_element(first_column_node, "span"))), 1, 0)
node_text <- first_column_node %>% html_text2()
data <- NULL
if (is_new_column) {
if (column_needs_split) {
new_values <- trimws(str_split_1(trimws(node_text), "\\n"))
red <- new_values[[1]]
measure <- new_values[[2]]
} else {
measure <- trimws(node_text)
}
} else {
data <- row_node %>%
html_elements(".data") %>%
html_text2()
}
if (!is.null(data)) {
row_data[[row]] <- c(c(red, measure), data)
# print(c(c(red, measure), data))
row <- row + 1
}
}
row_data <- discard(row_data, is.null)
df <- do.call(rbind, row_data) %>% as.data.frame()
colnames(df) <- c(
"red", "category", "measure",
"gen_fund", "gen_fund_perc", "gen_fund_per_student",
"all_fund", "all_fund_perc", "all_fund_per_student"
)
summary_info <- page %>% html_element('.c.systemtitle') %>% html_text(trim = T)
additional_info <- str_match_all(
gsub("\\n", "", summary_info),
"(\\d{4}\\s-\\s\\d{4}).*Totals for (.*?)\\sISD.*?\\((\\d{6})\\)"
)
df$year <- additional_info[[1]][, 2]
df$district <- additional_info[[1]][, 3]
df$isd_code <- additional_info[[1]][, 4]
head(df)
Sample output:
> head(df)
red category measure gen_fund gen_fund_perc
1 Revenues Operating Revenue Local Property Tax from M&O (excluding recapture) $70,019,020 15.03%
2 Revenues Operating Revenue State Operating Funds $333,999,269 71.68%
3 Revenues Operating Revenue Federal Funds $59,326,937 12.73%
4 Revenues Operating Revenue Other Local $2,644,317 0.57%
5 Revenues Operating Revenue Total Operating Revenue $465,989,543 100.00%
6 Revenues Other Revenue Local Property Tax from I&S $0 0.00%
gen_fund_per_student all_fund all_fund_perc all_fund_per_student year district isd_code
1 $1,823 $70,019,020 15.03% $1,823 2021 - 2022 BROWNSVILLE 031901
2 $8,695 $333,999,269 71.68% $8,695 2021 - 2022 BROWNSVILLE 031901
3 $1,544 $59,326,937 12.73% $1,544 2021 - 2022 BROWNSVILLE 031901
4 $69 $2,644,317 0.57% $69 2021 - 2022 BROWNSVILLE 031901
5 $12,131 $465,989,543 100.00% $12,131 2021 - 2022 BROWNSVILLE 031901
6 $0 $9,212,992 17.23% $240 2021 - 2022 BROWNSVILLE 031901

Related

A question about looping and creating/joining tables

My code is meant to order a table called Football (imported csv2) and then, using a for loop, go through the data and return the row number of the start year and end year.
Football[order(Football$Year),]
start_year <- min(Football$Year)
end_year <- max(Football$Year)
for (i in 1:nrow(Football)
{
if (Football$Year[i] = start_year)
{
row_of_start <- i
}
if (Football$Year[i] = end_year)
{
row_of_end <- i
}
}
This produces the following error:
> if (Football$Year[1] = start_year) row_of_start <- 1
Error: unexpected '=' in "if (Football$Year[1] ="
I appreciate there are probably ways of doing this without a for loop (which I would be very appreciative to know) although I would also like to know how to make the for loop work (to further my understanding).
You can skip the loop entirely using which(). This will usually be faster and more legible:
# Create example data
set.seed(123)
Football <- data.frame(Year = sample(1990:2000, size = 10),
foo = sample(letters, size = 10))
# Sort the data as you have done
Football_sort <- Football[order(Football$Year), ]
# Get the row numbers of the min and max (start and end years)
which(with(Football_sort, Year == min(Year)))
#> [1] 1
which(with(Football_sort, Year == max(Year)))
#> [1] 10
Depending upon what you actually want to do, you can skip the ordering step as well. Both of the below depend upon the dplyr package to work.
If you just want the start and end year rows rather than their row numbers:
library(dplyr)
Football %>%
filter(Year %in% c(min(Year), max(Year)))
#> Year foo
#> 1 2000 e
#> 2 1990 d
If you want the "year number" of the start and end year:
Football %>%
summarise(start_year = 1,
end_year = max(Year) - min(Year))
#> start_year end_year
#> 1 1 10

Problem formatting spreadsheets in R, how can I read and write to tables using R?

I'm working with R for the first time for a class in college. To preface this: I don't know enough to know what I don't know, so I'm sorry if this question has been asked before. I am trying to predict the results of the Texas state house elections in 2020, and I think the best prior for that is the results of the 2018 state house elections. There are 150 races, so I can't bare to input them all by hand, but I can't find any spreadsheet that has data formatted how I want it. I want it in a pretty standard table format:
My desired table format. However, the table from the Secretary of state I have looks like the following:
Gross ugly table.
I wrote some psuedo code:
Here's the Psuedo Code, basically we want to construct a new CSV:
'''%First, we want to find a district, the house races are always preceded by a line of dashes, so I will need a function like this:
Create a New CSV;
for(x=1; x<151 ; x +=1){
Assign x to the cell under the district number cloumn;
Find "---------------" ;
Go down one line;
Go over two lines;
% We should now be in the third column and now want to read in which party got how many votes. The number of parties is not consistant, so we need to account for uncontested races, libertarians, greens, and write ins. I want totals for Republicans, Democrats, and Other.
while(cell is not empty){
Party <- function which reads cell (but I want to read a string);
go right one column;
Votes <- function which reads cell (but I want to read an integer);
if(Party = Rep){
put this data in place in new CSV;
else if (Party = Dem)
put this data in place in new CSV;
else
OtherVote += Votes;
};
};
Assign OtherVote to the column for other party;
OtherVote <- 0;
%Now I want to assign 0 to null cells (ones where no rep, or no Dem, or no other party contested
read through single row 4 spaces, if its null assign it 0;
Party <- null
};'''
But I don't know enough to google what to do! Here's what I need help with: Can I create a new CSV in Rstudio, how? How can I read specific cells in a table, hopefully indexing? Lastly, how do I write to a table in R. Any help is appreciated! Thank you!
Can I create a new CSV in Rstudio, how?
Yes you can. Use the "write.csv" function.
write.csv(df, file = "df.csv") #see help for more information.
How can I read specific cells in a table?
Use the brackets after df,example below.
df <- data.frame(x = c(1,2,3), y = c("A","B","C"), z = c(15,25,35))
df[1,1]
#[1] 1
df[1,1:2]
# x y
#1 1 A
How do I write to a table in R?
If you want to write a table in xlsx use the function write.xlsx from openxlsx package.
Wikipedia seems to have a table that is closer to the format you are looking for.
In order to get to the table you are looking for we need a few steps:
Download data from Wikipedia and extract table.
Clean up table.
Select columns.
Calculate margins.
1. Download data from wikipedia and extract table.
The rvest table helps with downloading and parsing websites into R objects.
First we download the HTML of the whole website.
library(dplyr)
library(rvest)
wiki_html <-
read_html(
"https://en.wikipedia.org/wiki/2018_United_States_House_of_Representatives_elections_in_Texas"
)
There are a few ways to get a specific object from an HTML file in this case
I dedided to look for the table that has the class name “wikitable plainrowheaders sortable”,
as I learned from inspecting the code, that the only table with that class is
the one we want to extract.
library(purrr)
html_nodes(wiki_html, "table") %>%
map_lgl( ~ html_attr(., "class") == "wikitable plainrowheaders sortable") %>%
which()
#> [1] 20
Then we can select table number 20 and convert it to a dataframe with html_table()
raw_table <-
html_nodes(wiki_html, "table")[[20]] %>%
html_table(fill = TRUE)
2. Clean up table.
The table has duplicated names, we can change that by using as_tibble() and its .name_repair argument. We then usedplyr::select() to get the columns. Furthermore we usedplyr::filter() to delete the first two rows, that have "District" as a value in theDistrictcolumn. Now the columns are still characters
vectors, but we need them to be numeric, therefore we first delete commas from
all columns and then transform columns 2 to 4 to numeric.
clean_table <-
raw_table %>%
as_tibble(.name_repair = "unique") %>%
filter(District != "District") %>%
mutate_all( ~ gsub(",", "", .)) %>%
mutate_at(2:4, as.numeric)
3. Select columns and 4. Calculate margins.
We use dplyr::select() to select the columns you are interested in and give them more helpful names.
Finally we calculate the margin between democratic and republican votes by first adding up there votes
as total_votes and then dividing the difference by total_votes.
clean_table %>%
select(District,
RepVote = Republican...2,
DemVote = Democratic...4,
OthVote = Others...6) %>%
mutate(
total_votes = RepVote + DemVote,
margin = abs(RepVote - DemVote) / total_votes * 100
)
#> # A tibble: 37 x 6
#> District RepVote DemVote OthVote total_votes margin
#> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 District 1 168165 61263 3292 229428 46.6
#> 2 District 2 139188 119992 4212 259180 7.41
#> 3 District 3 169520 138234 4604 307754 10.2
#> 4 District 4 188667 57400 3178 246067 53.3
#> 5 District 5 130617 78666 224 209283 24.8
#> 6 District 6 135961 116350 3731 252311 7.77
#> 7 District 7 115642 127959 0 243601 5.06
#> 8 District 8 200619 67930 4621 268549 49.4
#> 9 District 9 0 136256 16745 136256 100
#> 10 District 10 157166 144034 6627 301200 4.36
#> # … with 27 more rows
Edit: In case you want to go with the data provided by the state, it looks to me as if the data you are looking for is in the first, third and fourth column. So what you want to do is.
(All the code below is not tested, as I do not have the original data.)
read data into R
library(readr)
tx18 <- read_csv("filename.csv")
select relevant columns
tx18 <- tx18 %>%
select(c(1,3,4))
clean table
tx18 <- tx18 %>%
filter(!is.na(X3),
X3 != "Party",
X3 != "Race Total")
Group and summarize data by party
tx18 <- tx18 %>%
group_by(X3) %>%
summarise(votes = sum(X3))
Pivot/ Reshape data to wide format
tx18 %>$
pivot_wider(names_from = X3,
values_from = votes)
After this you could then calculate the margin similarly as I did with the Wikipedia data.

.TXT in long form to data.frame in wide form in R

I am currently working with clinical assessment data that is scored and output by a software package in a .txt file. My goal is extract the data from the txt file into a long format data frame with a column for: Participant # (which is included in the file name), subtest, Score, and T-score.
An example data file is available here:
https://github.com/AlexSwiderski/CatTextToData/blob/master/Example_data
I am running into a couple road blocks that I could use some input into how navigate.
1) I only need the information that corresponds to each subtest, these all have a number prior to the subtest name. Therefore, the rows that only have one to two words that are not necessary (eg cognitive screen) seem to be interfering creating new data frames because I have a mismatch in columns provided and columns wanted.
Some additional corks to the data:
1) the asteriks are NOT necessary
2) the cognitive TOTAL will never have a value
I am utilizing the readtext package to import the data at the moment and I am able to get a data frame with two columns. One being the file name (this includes the participant name) so that problem is fixed. However, the next column is a a giant character string with the columns data points for both Score and T-Score. Presumably I would then need to split these into the columns of interest, previously listed.
Next problem, when I view the data the T scores are in the correct order, however the "score" data no longer matches the true values.
Here is what I have tried:
# install.packages("readtext")
library(readtext)
library(tidyr)
pathTofile <- path.expand("/Users/Brahma/Desktop/CAT TEXT FILES/")
data <- readtext(paste0(pathTofile2, "CAToutput.txt"),
#docvarsfrom = "filenames",
dvsep = " ")
From here I do not know how to split the data, in my head I would do something like this
data2 <- separate(data2, text, sep = " ", into = c("subtest", "score", "t_score"))
This of course, gives the correct column names but removes almost all the data I actually am interested in.
Any help would be appreciated whether a solution or a direction you might suggest I look for more answers.
Sincerely,
Alex
Here is a way of converting that text file to a dataframe that you can do analysis on
library(tidyverse)
input <- read_lines('c:/temp/scores.txt')
# do the match and keep only the second column
header <- as_tibble(str_match(input, "^(.*?)\\s+Score.*")[, 2, drop = FALSE])
colnames(header) <- 'title'
# add index to the list so we can match the scores that come after
header <- header %>%
mutate(row = row_number()) %>%
fill(title) # copy title down
# pull off the scores on the numbered rows
scores <- str_match(input, "^([0-9]+[. ]+)(.*?)\\s+([0-9]+)\\s+([0-9*]+)$")
scores <- as_tibble(scores) %>%
mutate(row = row_number())
# keep only rows that are numbered and delete first column
scores <- scores[!is.na(scores[,1]), -1]
# merge the header with the scores to give each section
table <- left_join(scores,
header,
by = 'row'
)
colnames(table) <- c('index', 'type', 'Score', 'T-Score', 'row', 'title')
head(table, 10)
# A tibble: 10 x 6
index type Score `T-Score` row title
<chr> <chr> <chr> <chr> <int> <chr>
1 "1. " Line Bisection 9 53 3 Subtest/Section
2 "2. " Semantic Memory 8 51 4 Subtest/Section
3 "3. " Word Fluency 1 56* 5 Subtest/Section
4 "4. " Recognition Memory 40 59 6 Subtest/Section
5 "5. " Gesture Object Use 2 68 7 Subtest/Section
6 "6. " Arithmetic 5 49 8 Subtest/Section
7 "7. " Spoken Words 17 45* 14 Spoken Language
8 "9. " Spoken Sentences 25 53* 15 Spoken Language
9 "11. " Spoken Paragraphs 4 60 16 Spoken Language
10 "8. " Written Words 14 45* 20 Written Language
What is the source for the code at the link provided?
https://github.com/AlexSwiderski/CatTextToData/blob/master/Example_data
This data is odd. I was able to successfully match patterns and manipulate most of the data, but two rows refused to oblige. Rows 17 and 20 refused to be matched. In addition, the data type / data structure are very unfamiliar.
This is what was accomplished before hitting a wall.
df <- read.csv("test.txt", header = FALSE, sep = ".", skip = 1)
df1 <- df %>% mutate(V2, Extract = str_extract(df$V2, "[1-9]+\\s[1-9]+\\*+\\s?"))
df2 <- df1 %>% mutate(V2, Extract2 = str_extract(df1$V2, "[0-9]+.[0-9]+$"))
head(df2)
When the data was further explored, the second column, V2, included data types that are completely unfamiliar. These included: Arithmetic, Complex Words, Digit Strings, and Function Words.
If anything, it would good to know something about those unfamiliar data types.
Took another look at this problem and found where it had gotten off track. Ignore my previous post. This solution works in Jupyter Lab using the data that was provided.
library(stringr)
library(dplyr)
df <- read.csv("test.txt", header = FALSE, sep = ".", skip = 1)
df1 <- df %>% mutate(V2, "Score" = str_extract(df$V2, "\\d+") )
df2 <- df1 %>% mutate(V2, "T Score" = str_extract(df$V2, "\\d\\d\\*?$"))
df3 <- df2 %>% mutate(V2, "Subtest/Section" = str_remove_all(df2$V2, "\\\t+[0-9]+"))
df4 <- df3 %>% mutate(V1, "Sub-S" = str_extract(df3$V1, "\\s\\d\\d\\s*"))
df5 <- df4 %>% mutate(V1, "Sub-T" = str_extract(df4$V1,"\\d\\d\\*"))
df6 <- replace(df5, is.na(df5), "")
df7 <- df6 %>% mutate(V1, "Description" = str_remove_all(V1, "\\d\\d\\s\\d\\d\\**$")) # remove digits, new variable
df7$V1 <- NULL # remove variable
df7$V2 <- NULL # remove variable
df8 <- df7[, c(6,3,1,4,2,5)] # re-align variables
head(df8,15)

R scrape html table and extract background color

I am trying to scrape some data off a wikipedia table from this page:
https://en.wikipedia.org/wiki/Results_of_the_Indian_general_election,_2014 and I am interested in the table:
Summary of the 2014 Indian general election
I would also like to extract the party colors from the table.
Here's what I've tried so far:
library("rvest")
url <-
"https://en.wikipedia.org/wiki/Results_of_the_Indian_general_election,_2014"
electionstats <- read_html(url)
results <- html_nodes(electionstats, xpath='//*[#id="mw-content-text"]/div/table[79]') %>% html_table(fill = T)
party_colors <- electionstats %>%
html_nodes(xpath='//*[#id="mw-content-text"]/div/table[3]') %>%
html_table(fill = T)
Printing out party_colors does not show any info about the colors
So, I tried:
party_colors <- electionstats %>% html_nodes(xpath='//*[#id="mw-content-text"]/div/table[3]') %>%
html_nodes('tr')
Now if I print out party_colors, I get:
[1] <tr style="background-color:#E9E9E9">\n<th style="text-align:left;vertical-align:bottom;" rowspan="2"></th>\n<th style="text-align:left; ...
[2] <tr style="background-color:#E9E9E9">\n<th style="text-align:center;">No.</th>\n<th style="text-align:center;">+/-</th>\n<th style="text ...
[3] <tr>\n<td style="background-color:#FF9933"></td>\n<td style="text-align:left;"><a href="/wiki/Bharatiya_Janata_Party" title="Bharatiya J ...
[4] <tr>\n<td style="background-color:#00BFFF"></td>\n<td style="text-align:left;"><a href="/wiki/Indian_National_Congress" title="Indian Na ...
[5] <tr>\n<td style="background-color:#009900"></td>\n<td style="text-align:left;"><a href="/wiki/All_India_Anna_Dravida_Munnetra_Kazhagam" ...
and so on...
But, now, I have no idea how to pull out the colors from this data. I cannot convert the output of the above to a html_table with:
html_table(fill = T)
I get the error:
Error: html_name(x) == "table" is not TRUE
I also tried various options with html_attrs, but I have no idea what the correct attribute I should be passing is.
I even tried SelectorGadget to try and figure out the attribute, but if I select the first column of the table in question, SelectorGadget shows just "td".
I would get the table like you did and then add the color attribute as a column. The wikitable sortable class works on many pages, so get the first one and remove the second header in row 1.
electionstats <- read_html(url)
x <- html_nodes(electionstats, xpath='//table[#class="wikitable sortable"]')[[1]] %>%
html_table(fill=TRUE)
# paste names from 2nd row header and then remove
names(x)[6:14] <- paste(names(x)[6:14], x[1,6:14])
x <- x[-1,]
The colors are in the first tr/td tags and you can add it to empty column 1 or 3 (see str(x))
names(x)[3] <- "Color"
x$Color <- html_nodes(electionstats, xpath='//table[#class="wikitable sortable"][1]/tr/td[1]') %>%
html_attr("style") %>% gsub("background-color:", "", .)
## drop table footer, extra columns
x <- x[1:83, 2:14]
head(x)
Party Color Alliance Abbr. Candidates No. Candidates +/- Candidates %
2 Bharatiya Janata Party #FF9933 NDA BJP 428 -5 78.82%
3 Indian National Congress #00BFFF UPA INC 464 24 85.45%
4 All India Anna Dravida Munnetra Kazhagam #009900 ADMK 40 17 7.37%
5 All India Trinamool Congress #00FF00 AITC 131 96 24.13%
6 Biju Janata Dal #006400 BJD 21 3 3.87%
7 Shiv Sena #E3882D NDA SHS 24 11 10.68%
Looks like your xml_nodeset contains both tr and td nodes.
Deal with both trs and tds, converting to data frames:
party_colors_tr <- electionstats %>% html_nodes(xpath='//*[#id="mw-content-text"]/div/table[3]') %>% html_nodes('tr')
trs <- bind_rows(lapply(xml_attrs(party_colors_tr), function(x) data.frame(as.list(x), stringsAsFactors=FALSE)))
party_colors_td <- electionstats %>% html_nodes(xpath='//*[#id="mw-content-text"]/div/table[3]') %>% html_nodes('tr') %>% html_nodes('td')
tds <- bind_rows(lapply(xml_attrs(party_colors_td), function(x) data.frame(as.list(x), stringsAsFactors=FALSE)))
Write function for extracting styles from data frames:
library(stringi)
list_styles <- function(nodes_frame) {
get_cols <- function(x) { stri_detect_fixed(x, 'background-color') }
has_style <- which(lapply(nodes_frame$style, get_cols) == TRUE)
res <- strsplit(nodes_frame[has_style,]$style, ':')
return(res)
}
Create data frame of extracted styles:
l_trs <- list_styles(trs)
df_trs <- data.frame(do.call('rbind', l_trs)[,1], do.call('rbind', l_trs)[,2])
names(df_trs) <- c('style', 'color')
l_tds <- list_styles(tds)
df_tds <- data.frame(do.call('rbind', l_tds)[,1], do.call('rbind', l_tds)[,2])
names(df_tds) <- c('style', 'color')
Combine trs and tds frames:
final_style_frame <- do.call('rbind', list(df_trs, df_tds))
Here are the first 20 rows:
final_style_frame[1:20,]

Reshape data frame for consecutive years

I have data about thousands of customers who visited stores in the 3 past years.
For each customer, I have :
ID
Combination of a year and the first store visited in this year.
Customer_Id | Year_*_Store
1 2010_A
1 2011_B
1 2012_C
2 2010_A
2 2011_B
2 2012_D
What I’d like to have is the following structure of data in order to visualize the evolution of the customers’behaviour with a riverplot( aka Sankey plot)
For instance the 2 customers, who firstly visited the store A in 2010, firstly visited the store B in 2011:
SOURCE | TARGET | NB_CUSTOMERS
2010_A 2011_B 2
2011_B 2012_C 1
2011_B 2012_D 1
I don't want links between two years which are not consecutive like 2010_A and 2012_D
How can I do that in R ?
I would do this with dplyr (faster)
df<-read.table(header=T,text="Customer_Id Year_Store
1 2010_A
1 2011_B
1 2012_C
2 2010_A
2 2011_B
2 2012_D")
require(dplyr) # for aggregation
require(riverplot) # for Sankey
targets<-
group_by(df,Customer_Id) %.% # group by Customer
mutate(source=Year_Store,target=c(as.character(Year_Store)[-1],NA)) %.% # add a lag to show the shift
filter(!is.na(target)) %.% # filter out empty edges
regroup(list("source","target")) %.% # regroup by source & target
summarise(len=length(Customer_Id)) %.% # count customers for relationship
mutate(step=as.integer(substr(target,1,4))-as.integer(substr(source,1,4))) %.% # add a step to show how many years
filter(step==1) # filter out relationships for non consec years
topnodes <- c(as.character(unique(df$Year_Store))) # unique nodes
nodes <- data.frame( ID=topnodes, # IDs
x=as.numeric(substr(topnodes,1,4)), # x value for plot
col= rainbow(length(topnodes)), # color each different
labels= topnodes, # labels
stringsAsFactors= FALSE )
edges<- # create list of list
lapply(unique(targets$source),function(x){
l<-as.list(filter(targets,source==x)$len) # targets per source
names(l)<-filter(targets,source==x)$target # name of target
l
})
names(edges)<-unique(targets$source) # name top level nodes
r <- makeRiver( nodes, edges) # make the River
plot( r ) # plot it!
Note that you can't have a * in column names (see ?make.names). Here is a basic approach:
Split Year_store into two separate columns Year and Store in your data frame; at the moment it contains two completely different kinds of data and you actually need to process them separately.
Make a NextYear column, defined as Year + 1
Make a NextStore column, in which you assign the store code matching Customer_Id and for which Year is the same as this row's NextYear, assigning NA if there is no record of the customer visiting a store the next year, and throwing an error if the data do not meet the required specification (are ambiguous about which store was visited first the next year).
Strip out any of the rows in which NextStore is NA, and combine the NextYear and NextStore columns into a NextYear_NextStore column.
Summarize your data frame by the Year_store and NextYear_NextStore columns e.g. using ddply in the plyr package.
Some sample data:
# same example data as question
customer.df <- data.frame(Customer_Id = c(1, 1, 1, 2, 2, 2),
Year_Store = c("2010_A", "2011_B", "2012_C", "2010_A", "2011_B", "2012_D"),
stringsAsFactors = FALSE)
# alternative data should throw error, customer 2 is inconsistent in 2011
badCustomer.df <- data.frame(Customer_Id = c(1, 1, 1, 2, 2, 2),
Year_Store = c("2010_A", "2011_B", "2012_C", "2010_A", "2011_B", "2011_D"),
stringsAsFactors = FALSE)
And an implementation:
require(plyr)
splitYearStore <- function(df) {
df$Year <- as.numeric(substring(df$Year_Store, 1, 4))
df$Store <- as.character(substring(df$Year_Store, 6))
return(df)
}
findNextStore <- function(df, matchCust, matchYear) {
matchingStore <- with(df,
df[Customer_Id == matchCust & Year == matchYear, "Store"])
if (length(matchingStore) == 0) {
return(NA)
} else if (length(matchingStore) > 1) {
errorString <- paste("Inconsistent store results for customer",
matchCust, "in year", matchYear)
stop(errorString)
} else {
return(matchingStore)
}
}
tabulateTransitions <- function(df) {
df <- splitYearStore(df)
df$NextYear <- df$Year + 1
df$NextStore <- mapply(findNextStore, matchCust = df$Customer_Id,
matchYear = df$NextYear, MoreArgs = list(df = df))
df$NextYear_NextStore <- with(df, paste(NextYear, NextStore, sep = "_"))
df <- df[!is.na(df$NextStore),]
df <- ddply(df, .(Source = Year_Store, Target = NextYear_NextStore),
summarise, No_Customers = length(Customer_Id))
return(df)
}
Results:
> tabulateTransitions(customer.df)
Source Target No_Customers
1 2010_A 2011_B 2
2 2011_B 2012_C 1
3 2011_B 2012_D 1
> tabulateTransitions(badCustomer.df)
Error in function (df, matchCust, matchYear) :
Inconsistent store results for customer 2 in year 2011
No attempt has been made to optimise; if your data set is massive then perhaps you should investigate a data.table solution.

Resources