Rearranging order of data frame rows based on a vector - r

I have the following data frame above and a vector x = 1, 156, 153, 3, 185. My vector corresponds to the node.id column shown in the picture, and I would like to rearrange the rows of the data frame to match the order of my vector. So the order of the data frame rows should be the row with node.id = 1, then 156, 153, 3, 185. Hopefully, I explained this well enough.

We can use match
# match(x, df1$node.id) gives, for each element of x, the row of df1 that
# holds that node.id, so indexing with it returns the rows in the order of x.
# (The reversed call match(df1$node.id, x) yields the inverse permutation and
# only gives the right answer when the permutation is its own inverse, as it
# happens to be for this example.)
df1[match(x, df1$node.id), ]
# x lon lat node.id name
#1 1 -122.41 37.70 1 San Francisco
#4 22 -117.16 32.71 156 San Diego
#3 21 -118.24 34.05 153 Los Angeles
#2 3 -115.14 36.16 3 Las Vegas
#5 26 -112.08 38.77 185 Richfield
data
df1 <- structure(list(x = c(1, 3, 21, 22, 26), lon = c(-122.41, -115.14,
-118.24, -117.16, -112.08), lat = c(37.7, 36.16, 34.05, 32.71,
38.77), node.id = c(1, 3, 153, 156, 185), name = structure(c(5L,
1L, 2L, 4L, 3L), .Label = c("Las Vegas", "Los Angeles", "Richfield",
"San Diego", "San Francisco"), class = "factor")),
class = "data.frame", row.names = c(NA,
-5L))
x <- c(1, 156, 153, 3, 185)

Related

How to join two dataframes containing time varying variables in R

This seems like a simple enough thing but I can't figure it out nor find an answer online - apologies if it is something obvious. I have two separate dataframes containing the same patients with the same unique identifier. Both datasets have time-varying variables - one continuous and one categorical - and the time to each reading is different in the two sets, but they have a common start point at time 1. I have tried to modify the tmerge function from the survival package but without luck, as I have neither a dichotomous outcome variable nor a single data set with one row per patient.
Reprex for creating the datasets below (df1 and df2) and an example of my desired combined output table for a single patient (ID 3), output gets very long if done for all 4 patients
Thanks for any possible help
df1 <- structure(list(tstart = c(1, 1, 1, 1426, 1, 560, 567), tstop = c(2049,
3426, 1426, 1707, 560, 567, 4207), category = structure(c(1L,
1L, 1L, 2L, 1L, 4L, 2L), .Label = c("none", "high", "low", "moderate"
), class = "factor"), id = c(1L, 2L, 3L, 3L, 4L, 4L, 4L)), row.names = c(NA,
-7L), class = c("tbl_df", "tbl", "data.frame"))
df2 <- structure(list(tstart = c(1, 365, 730, 1, 365, 730, 1096, 2557,
1, 365, 730, 1096, 1826, 2557, 3652, 1), tstop = c(365, 730,
1096, 365, 730, 1096, 2557, 2582, 365, 730, 1096, 1826, 2557,
3652, 4864, 365), egfr = c(66, 62, 58, 54, 50, 43, 49, 51, 106,
103, 80, 92, 97, 90, 81, 51), id = c(1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
df_example_patient_3 <- structure(list(id = c(3L, 3L, 3L,
3L, 3L, 3L,3L, 3L, 3L), tstart = c(1, 365, 730, 1096, 1426, 1707, 1826, 2557, 3652), tstop = c(365, 730,
1096, 1426, 1707, 1826, 2557, 3652, 4864), egfr = c(106, 103, 80, 92, 92, 92, 97, 90, 81), category = c("none", "none", "none", "none", "high", "high", "high", "high", "high")), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"))
# DF1
tstart tstop category id
<dbl> <dbl> <fct> <int>
1 1 2049 none 1
2 1 3426 none 2
3 1 1426 none 3
4 1426 1707 high 3
5 1 560 none 4
6 560 567 moderate 4
7 567 4207 high 4
# DF2
tstart tstop egfr id
<dbl> <dbl> <dbl> <int>
1 1 365 66 1
2 365 730 62 1
3 730 1096 58 1
4 1 365 54 2
5 365 730 50 2
6 730 1096 43 2
7 1096 2557 49 2
8 2557 2582 51 2
9 1 365 106 3
10 365 730 103 3
11 730 1096 80 3
12 1096 1826 92 3
13 1826 2557 97 3
14 2557 3652 90 3
15 3652 4864 81 3
16 1 365 51 4
# Combined set
id tstart tstop egfr category
<int> <dbl> <dbl> <dbl> <chr>
1 3 1 365 106 none
2 3 365 730 103 none
3 3 730 1096 80 none
4 3 1096 1426 92 none
5 3 1426 1707 92 high
6 3 1707 1826 92 high
7 3 1826 2557 97 high
8 3 2557 3652 90 high
9 3 3652 4864 81 high
I had to do it this way to really work out the details.
First, I construct a full df1 with all the timestamps, including those of df2.
Then I proceed with multiple merges. This is not elegant, but it works:
library(data.table)
library(zoo)
# Proper data.tables
# Convert both inputs to data.tables by reference, keyed on id and tstart.
setDT(df1, key = c("id", "tstart"))
setDT(df2, key = c("id", "tstart"))

# Collect every interval boundary (start or stop, from either table) per id.
# Columns are bound by position (use.names = FALSE), so the tstop values end
# up in the tstart column of the combined table.
timestamps_by_id <- unique(rbind(
  df1[, .(id, tstart)],
  df1[, .(id, tstop)],
  df2[, .(id, tstart)],
  df2[, .(id, tstop)],
  use.names = FALSE
))
setorder(timestamps_by_id, id, tstart)

# Merge to construct full df1
df1_full <- df1[timestamps_by_id]
# Carry the last known category forward within each id. na.rm = FALSE keeps
# the vector length unchanged even if a group were to start with NA.
df1_full[, category := na.locf(category, na.rm = FALSE), by = id]
# Each interval ends where the next one starts (shift by -1 = lead).
df1_full[, tstop := shift(tstart, -1), by = id]
setkey(df1_full, id, tstart)

# Merge with df2
# Rolling join on (id, tstart); rows left with any NA are dropped.
result <- na.omit(df2[df1_full, roll = TRUE])
result[, tstop := i.tstop]
print(result[id == 3, .(id, tstart, tstop, egfr, category)])
Or a more data.tabley solution using the more arcane foverlaps:
library(data.table)
# Proper data.tables
# Convert both inputs to data.tables by reference, keyed on id and the two
# interval columns.
setDT(df1, key = c("id", "tstart", "tstop"))
setDT(df2, key = c("id", "tstart", "tstop"))
# We add an infinite upper range
# For each id, take the row with the largest tstop and append a copy of it
# that starts at that tstop and ends at Inf, carrying the same category.
proper_df1 <- rbind(
df1,
df1[, .SD[which.max(tstop)], by = .(id)][, .(id, tstart = tstop, tstop = Inf, category), ]
)
setkey(proper_df1, id, tstart, tstop)
overlaps <- foverlaps(df2, proper_df1, type = "any") # Overlap join
# NOTE(review): i.tstart / i.tstop are presumably the interval columns coming
# from df2 (the first foverlaps argument) - confirm against the data.table docs.
# Where the df1 interval starts inside the other interval (and not at the
# common start point 1), move the output start up to that df1 start.
overlaps[
tstart %between% .(i.tstart, i.tstop) & tstart != 1,
i.tstart := tstart
]
# Likewise, where the df1 interval ends inside the other interval, cut the
# output interval at that df1 end.
overlaps[tstop %between% .(i.tstart, i.tstop), i.tstop := tstop]
# Show the combined intervals for patient 3 only.
print(overlaps[
id == 3,
.(id, "tstart" = i.tstart, "tstop" = i.tstop, category, egfr)
])
This messy dplyr solution seems to work for this particular dataset, but I don't know whether it would work for all datasets; the direction of the fill may need to be altered depending on the particular dataset.
library(tidyverse)
library(magrittr)
# Stack the rows of both tables, order them by interval end within each
# patient, clip overlapping starts, and fill the missing egfr / category
# values from neighbouring rows. Only patient 3 is shown at the end.
df1 %>%
bind_rows(df2) %>%
group_by(id) %>%
arrange(id, tstop) %>%
mutate(
tstart = case_when(
# A start earlier than the previous row's stop is moved to that stop;
# the first row of a group (lag is NA) keeps its own start.
tstart < lag(tstop) ~ lag(tstop), TRUE ~ tstart)) %>%
# "updown" fills upwards first, then downwards for whatever is still missing.
fill(egfr, category, .direction = "updown") %>%
ungroup() %>%
filter(id == 3)
tstart tstop category id egfr
<dbl> <dbl> <fct> <int> <dbl>
1 1 365 none 3 106
2 365 730 none 3 103
3 730 1096 none 3 80
4 1096 1426 none 3 92
5 1426 1707 high 3 92
6 1707 1826 high 3 92
7 1826 2557 high 3 97
8 2557 3652 high 3 90
9 3652 4864 high 3 81

Splitting full address column in multiple columns

I have a dataframe with the following column structure (over 1000+ rows total):
addressfull
POINT(3.124537653 32.179354012)||DEF_32||molengraaf 20, 1689 GL Utrecht, Netherlands||15||map
POINT(3.124537680 32.179354014)||DEF_32||winkellaan 67, 5788 BG Amsterdam, Netherlands||13||map
POINT(3.124537653 32.179354012)||DEF_32||vermeerstraat 18, 0932 DC Rotterdam, Netherlands||11||map
POINT(2.915206183 24.315583523)||DEF_32||--||13||map
POINT (2.900824999999923 34.3175721)||DEF_84||Zandhorstlaan 122, 0823 GT Ochtrup, Germany||17||map
structure(list(addressfull = structure(c(3L, 5L, 4L, 2L, 1L), .Label = c("POINT (2.900824999999923 34.3175721)||DEF_84||Zandhorstlaan 122, 0823 GT Ochtrup, Germany||17||map",
"POINT(2.915206183 24.315583523)||DEF_32||--||13||map", "POINT(3.124537653 32.179354012)||DEF_32||molengraaf 20, 1689 GL Utrecht, Netherlands||15||map",
"POINT(3.124537653 32.179354012)||DEF_32||vermeerstraat 18, 0932 DC Rotterdam, Netherlands||11||map",
"POINT(3.124537680 32.179354014)||DEF_32||winkellaan 67, 5788 BG Amsterdam, Netherlands||13||map"
), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
The column contains a location, street, house number, zip code, city, and country. I want to split the column addressfull in R into multiple columns, for example:
street house number zip city country
molengraaf 20 1689 GL Utrecht Netherlands
winkellaan 67 5788 BG Amsterdam Netherlands
vermeerstraat 18 0932 DC Rotterdam Netherlands
na na na na na
Zandhorstlaan 122 0823 GT Ochtrup Germany
I have read the tidyr and stringr documentation. I can see a pattern for splitting (by ")", by "|" from position x, and by ","), but I can't figure out the correct code to split the column into multiple columns.
Can someone help me?
You could brute force it using sub for a base R approach:
# Extract each address component with one anchored regular expression per
# column; the replacement "\\1" keeps only the captured group.
# First whitespace-delimited token = street name.
df$street <- sub("^(\\S+)\\s+.*$", "\\1", df$adressfull)
# Digits following the street name = house number.
df$`house number` <- sub("^\\S+\\s+(\\d+).*$", "\\1", df$adressfull)
# After the first comma: digits, whitespace, capital letters = zip code.
df$zip <- sub("^\\S+\\s+\\d+,\\s*(\\d+\\s+[A-Z]+).*$", "\\1", df$adressfull)
# Last token before the final comma = city.
df$city <- sub("^.*?(\\S+),\\s*\\S+$", "\\1", df$adressfull)
# Token after the final comma = country.
df$country <- sub("^.*,\\s*(\\S+)$", "\\1", df$adressfull)
df
adressfull street house number zip
1 molengraaf 20, 1689 GL Utrecht, Netherlands molengraaf 20 1689 GL
city country
1 Utrecht Netherlands
Data:
df <- data.frame(adressfull=c("molengraaf 20, 1689 GL Utrecht, Netherlands"),
stringsAsFactors=FALSE)
This assumes that we have already isolated just the address text. To do that, consider:
# One raw record; its fields are delimited by a literal "||".
text <- "POINT(3.124537653 32.179354012)||DEF_32||molengraaf 20, 1689 GL Utrecht, Netherlands||15||map"
# strsplit() returns a one-element list for a single string; take that
# element and keep its third field, which holds the address text.
addresfull <- strsplit(text, "\\|\\|")[[1]][3]
addresfull
[1] "molengraaf 20, 1689 GL Utrecht, Netherlands"
A stringr solution is this:
# str_extract() comes from stringr, which has to be attached before use.
library(stringr)

# The source column is a factor; work on its character values.
raw_address <- as.character(addressfull$addressfull)

# Pull each component out of the raw record with its own regular expression.
# str_extract() returns NA where a pattern does not match, so records without
# an address produce an all-NA row.
addresssplit <- data.frame(
  # word right after the "DEF_nn||" field
  street = str_extract(raw_address, "(?<=DEF_\\d{2}\\|\\|)\\w+\\b"),
  # digits directly followed by a comma
  number = str_extract(raw_address, "\\d{1,}(?=,)"),
  # four digits, whitespace, two capital letters, preceded by whitespace
  zip = str_extract(raw_address, "(?<=\\s)\\d{4}\\s[A-Z]{2}"),
  # word following the zip code
  city = str_extract(raw_address, "(?<=\\d{4}\\s[A-Z]{2}\\s)\\w+"),
  # word following "<lowercase letter>, "
  country = str_extract(raw_address, "(?<=[a-z]\\b,\\s)\\w+\\b")
)
RESULT:
addresssplit
street number zip city country
1 molengraaf 20 1689 GL Utrecht Netherlands
2 winkellaan 67 5788 BG Amsterdam Netherlands
3 vermeerstraat 18 0932 DC Rotterdam Netherlands
4 <NA> <NA> <NA> <NA> <NA>
5 Zandhorstlaan 122 0823 GT Ochtrup Germany
DATA:
addressfull <- structure(list(addressfull = structure(c(3L, 5L, 4L, 2L, 1L), .Label = c("POINT (2.900824999999923 34.3175721)||DEF_84||Zandhorstlaan 122, 0823 GT Ochtrup, Germany||17||map",
"POINT(2.915206183 24.315583523)||DEF_32||--||13||map", "POINT(3.124537653 32.179354012)||DEF_32||molengraaf 20, 1689 GL Utrecht, Netherlands||15||map",
"POINT(3.124537653 32.179354012)||DEF_32||vermeerstraat 18, 0932 DC Rotterdam, Netherlands||11||map",
"POINT(3.124537680 32.179354014)||DEF_32||winkellaan 67, 5788 BG Amsterdam, Netherlands||13||map"
), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
This would be a tidyverse approach to the problem:
library(tidyverse)
df <- structure(list(addressfull = structure(c(3L, 5L, 4L, 2L, 1L), .Label = c("POINT (2.900824999999923 34.3175721)||DEF_84||Zandhorstlaan 122, 0823 GT Ochtrup, Germany||17||map",
"POINT(2.915206183 24.315583523)||DEF_32||--||13||map", "POINT(3.124537653 32.179354012)||DEF_32||molengraaf 20, 1689 GL Utrecht, Netherlands||15||map",
"POINT(3.124537653 32.179354012)||DEF_32||vermeerstraat 18, 0932 DC Rotterdam, Netherlands||11||map",
"POINT(3.124537680 32.179354014)||DEF_32||winkellaan 67, 5788 BG Amsterdam, Netherlands||13||map"
), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
# Split each record on the literal "||" into its first three fields; any
# further fields are discarded (extra = "drop"). Only the address is kept.
df %>% separate(addressfull, sep = "\\|\\|", into = c("Coords", "DEF", "ADDRESS"),extra = "drop") %>%
select(ADDRESS) %>%
# Split the address on commas into three parts.
separate(ADDRESS, sep = ",", into = c("street", "city", "country")) %>%
# Split in front of a space that is followed by a digit (the house number).
separate(street, sep = "(?= \\d)", into = c("street", "house_number")) %>%
# Split right after two consecutive capital letters (the end of the zip code).
separate(city, sep = "(?<=[A-Z][A-Z])", into = c("zip", "city"))
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [4].
#> Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [4].
#> street house_number zip city country
#> 1 molengraaf 20 1689 GL Utrecht Netherlands
#> 2 winkellaan 67 5788 BG Amsterdam Netherlands
#> 3 vermeerstraat 18 0932 DC Rotterdam Netherlands
#> 4 -- <NA> <NA> <NA> <NA>
#> 5 Zandhorstlaan 122 0823 GT Ochtrup Germany
Created on 2020-02-27 by the reprex package (v0.3.0)

Replace unicode by its value in a dataframe

I tried to replace the unicode "U+00F3" from a data frame with the sapply function but nothing happened. The unicode part I want to replace is a chr type.
Here the function :
tableExcel$Team <- sapply(tableExcel$Team, gsub, pattern = "<U+00F3>", replacement= "o")
EDIT :
Thanks to the answer of Cath below, I added before the + : \\
tableExcel$Team <- sapply(tableExcel$Team, gsub, pattern = "<U\\+00F3>", replacement= "o")
But it didn't work.
I also tried to provide an example of my dataset, but the problem is that it works on the example and not on my real data:
# Minimal example: team names (one of them containing the literal text
# "<U+00F3>") next to a numeric points column.
tableExcel <- data.frame(
  Team = c("A", "B", "C", "Reducci<U+00F3>n"),
  Point = c(2, 30, 40, 30)
)
# Store Team as plain character, whatever data.frame() produced.
tableExcel[["Team"]] <- as.character(tableExcel[["Team"]])
To provide more information, here the importation of my excel file:
tableExcel <- as.data.frame(read_excel("Dataset LOS.xls", sheet = "Liga Squads"))
The structure of my data :
# dput() output of the real data. Column names such as 2019-S01 are not
# syntactic R names, so they must be wrapped in backticks for this to parse.
structure(list(
  Team = c("CHURN", "CHURN", "RESIDENCIAL NPTB", "RESIDENCIAL NPTB",
           "AUDIENCIAS TV", "AUDIENCIAS TV"),
  Points = c("P. Asig", "P. entr", "P. Asig", "P. entr", "P. Asig", "P. entr"),
  `2019-S01` = c(0, 0, 50, 0, NA, NA),
  `2019-S02` = c(0, 0, 10, 10, NA, NA),
  `2019-S03` = c(93, 88, 46, 19, NA, NA),
  `2019-S04` = c(56, 48, 0, 0, 13, 13),
  `2019-S05` = c(NA, NA, 80.5, 49.5, 42, 28.5),
  `2019-S06` = c(NA, NA, 66, 48, 55, 39.5),
  `2019-S07` = c(131, 112, 103, 63, 40.5, 38)
), row.names = c(1L, 2L, 4L, 5L, 7L, 8L), class = "data.frame")
I'm unable to replicate the issue with gsub. The following works as expected:
# Replace every occurrence of the literal text "<U+00F3>" in Team with "o".
# The "+" is backslash-escaped so it is matched literally rather than acting
# as a regex quantifier.
tableExcel$Team <- gsub("<U\\+00F3>", "o", tableExcel$Team)
#### OUTPUT ####
Team Points 2019-S01 2019-S02 2019-S03 2019-S04 2019-S05 2019-S06 2019-S07
1 Reducci<U+00F1>n P. Asig 0 0 93 56 NA NA 131.0
2 CHURN P. entr 0 0 88 48 NA NA 112.0
4 Reducci<U+00F2>n P. Asig 50 10 46 0 80.5 66.0 103.0
5 RESIDENCIAL NPTB P. entr 0 10 19 0 49.5 48.0 63.0
7 AUDIENCIAS TV P. Asig NA NA NA 13 42.0 55.0 40.5
8 <NA> P. entr NA NA NA 13 28.5 39.5 38.0
9 Reduccion P. entr NA NA NA NA NA NA NA
However, replacement using regular expressions might not be the most efficient way to convert the unicode characters, as this would require multiple calls to gsub. Instead, you might want to give stringi's stri_unescape_unicode() a try:
# install.packages("stringi") # Use if not yet installed.
library(stringi)
# Rewrite every "<U+XXXX>" token (exactly four hex digits) as the escape
# sequence "\uXXXX", then let stringi turn the escapes into real characters.
# The hex-digit class confines each match to a single token, so several
# tokens in one string are converted independently; a greedy ".*" would
# instead span from the first "<U+" to the last ">" in the string.
tableExcel$Team <- stri_unescape_unicode(
  gsub("<U\\+([0-9A-Fa-f]{4})>", "\\\\u\\1", tableExcel$Team)
)
#### OUTPUT ####
Team Points 2019-S01 2019-S02 2019-S03 2019-S04 2019-S05 2019-S06 2019-S07
1 Reducciñn P. Asig 0 0 93 56 NA NA 131.0
2 CHURN P. entr 0 0 88 48 NA NA 112.0
4 Reducciòn P. Asig 50 10 46 0 80.5 66.0 103.0
5 RESIDENCIAL NPTB P. entr 0 10 19 0 49.5 48.0 63.0
7 AUDIENCIAS TV P. Asig NA NA NA 13 42.0 55.0 40.5
8 <NA> P. entr NA NA NA 13 28.5 39.5 38.0
9 Reducción P. entr NA NA NA NA NA NA NA
The format <U+0000> is first converted to \\u0000 using gsub and then unescaped. As you can see, it takes care of multiple unicode characters in one go, which makes things much simpler.
Data:
tableExcel <- structure(list(Team = c("Reducci<U+00F1>n", "CHURN", "Reducci<U+00F2>n",
"RESIDENCIAL NPTB", "AUDIENCIAS TV", NA, "Reducci<U+00F3>n"),
Points = c("P. Asig", "P. entr", "P. Asig", "P. entr", "P. Asig",
"P. entr", "P. entr"), `2019-S01` = c(0, 0, 50, 0, NA, NA,
NA), `2019-S02` = c(0, 0, 10, 10, NA, NA, NA), `2019-S03` = c(93,
88, 46, 19, NA, NA, NA), `2019-S04` = c(56, 48, 0, 0, 13,
13, NA), `2019-S05` = c(NA, NA, 80.5, 49.5, 42, 28.5, NA),
`2019-S06` = c(NA, NA, 66, 48, 55, 39.5, NA), `2019-S07` = c(131,
112, 103, 63, 40.5, 38, NA)), row.names = c(1L, 2L, 4L, 5L,
7L, 8L, 9L), class = "data.frame")

How to draw multiple lines in R under leaflet?

I am having trouble drawing multiple lines in R using leaflet. I have a base map of New York City stations. I would like to add more information from the existing data set. The data set has columns: start_lng, start_lat, end_lng, end_lat and total_trip. For each row, I would like to draw a separate line that connects the start point and the end point. The two stations will then be connected, which stands for a trip. I hope to have one trip for each row. Plus, for coloring, the darkness of the line segments should be based on total_trip. How would I be able to do that? Thanks.
leaflet(sample) %>%
addTiles() %>%
setView(-73.9,40.7, zoom = 11) %>%
addCircles(data = master_stations,lng = ~long, lat = ~lat, weight = 1, popup = ~name)
Here's part of my data set:
start.station.id start.station.longitude start.station.latitude end.station.longitude end.station.latitude total_trip
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 72 -73.99393 40.76727 -74.00859 40.73620 2
2 72 -73.99393 40.76727 -73.99074 40.73455 2
3 72 -73.99393 40.76727 -73.97722 40.76341 2
4 72 -73.99393 40.76727 -73.98192 40.76527 2
5 79 -74.00667 40.71912 -73.98163 40.75206 2
6 79 -74.00667 40.71912 -73.98658 40.75514 2
7 79 -74.00667 40.71912 -73.98317 40.75527 2
8 79 -74.00667 40.71912 -73.98722 40.75300 2
9 83 -73.97632 40.68383 -73.97493 40.68981 4
10 83 -73.97632 40.68383 -73.98657 40.70149 2
# ... with 899 more rows
This is the full data set:
structure(list(start.station.id = c(72, 72, 72, 72, 79, 79),
end.station.id = c(238, 285, 352, 468, 153, 465), total_trip = c(2L,
2L, 2L, 2L, 2L, 2L), start.station.name = c("\"W 52 St & 11 Ave\"",
"\"W 52 St & 11 Ave\"", "\"W 52 St & 11 Ave\"", "\"W 52 St & 11 Ave\"",
"\"Franklin St & W Broadway\"", "\"Franklin St & W Broadway\""
), start.station.longitude = c(-73.99392888, -73.99392888,
-73.99392888, -73.99392888, -74.00666661, -74.00666661),
start.station.latitude = c(40.76727216, 40.76727216, 40.76727216,
40.76727216, 40.71911552, 40.71911552), end.station.name = c("\"Bank St & Washington St\"",
"\"Broadway & E 14 St\"", "\"W 56 St & 6 Ave\"", "\"Broadway & W 55 St\"",
"\"E 40 St & 5 Ave\"", "\"Broadway & W 41 St\""), end.station.longitude = c(-74.00859207,
-73.99074142, -73.97722479, -73.98192338, -73.9816324043,
-73.98658032), end.station.latitude = c(40.7361967, 40.73454567,
40.76340613, 40.7652654, 40.752062307, 40.75513557)), .Names = c("start.station.id",
"end.station.id", "total_trip", "start.station.name", "start.station.longitude",
"start.station.latitude", "end.station.name", "end.station.longitude",
"end.station.latitude"), row.names = c(NA, -6L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = list(start.station.id), drop = TRUE, indices = list(
0:3, 4:5), group_sizes = c(4L, 2L), biggest_group_size = 4L, labels = structure(list(
start.station.id = c(72, 79)), row.names = c(NA, -2L), class = "data.frame", vars = list(
start.station.id), drop = TRUE, .Names = "start.station.id"))

Populating new variable from ddply within old data frame in R

I have a data.frame which looks like this (in reality 1M rows):
`> df
R.DMA.NAMES quarter daypart allpersons.imp rate station spot.id
1 Wilkes.Barre.Scranton.Hztn Q22014 afternoon 0.0 30 WSWB 13048713
2 Nashville Q12014 primetime 0.0 50 COM NASHVILLE 11969260
3 Seattle.Tacoma Q12014 primetime 6.1 51 ESPN SEATTLE, EVERETT ZONE 11898905
4 Jacksonville Q42013 late fringe 2.3 130 Jacksonville WAWS 11617447
5 Detroit Q22014 overnight 0.0 0 WKBD 12571421
6 South.Bend.Elkhart Q42013 primetime 11.5 325 WBND 11741171`
dput(df)
structure(list(R.DMA.NAMES = c("Wilkes.Barre.Scranton.Hztn",
"Nashville", "Seattle.Tacoma", "Jacksonville", "Detroit", "South.Bend.Elkhart"
), quarter = structure(c(3L, 1L, 1L, 6L, 3L, 6L), .Label = c("Q12014",
"Q22013", "Q22014", "Q32013", "Q32014", "Q42013"), class = "factor"),
daypart = c("afternoon", "primetime", "primetime", "late fringe",
"overnight", "primetime"), allpersons.imp = c(0, 0, 6.1,
2.3, 0, 11.5), rate = c(30, 50, 51, 130, 0, 325), station = c("WSWB",
"COM NASHVILLE", "ESPN SEATTLE, EVERETT ZONE", "Jacksonville WAWS",
"WKBD", "WBND"), spot.id = c(13048713L, 11969260L, 11898905L,
11617447L, 12571421L, 11741171L)), .Names = c("R.DMA.NAMES",
"quarter", "daypart", "allpersons.imp", "rate", "station", "spot.id"
), row.names = c(NA, -6L), class = "data.frame")
I am using a ddply function to perform a calculation:
# Ratio of summed rate to summed allpersons.imp for each
# R.DMA.NAMES / station / quarter combination.
# ddply() hands each group's rows to the function as `x`, so the sums must be
# taken over `x`; summing over the full `df` would return the same overall
# ratio for every group.
ddply(df, .(R.DMA.NAMES, station, quarter), function(x) {
  sum(x$rate) / sum(x$allpersons.imp)
})
This creates a new data.frame which looks like this:
R.DMA.NAMES station quarter V1
1 Detroit WKBD Q22014 NaN
2 Jacksonville Jacksonville WAWS Q42013 56.521739
3 Nashville COM NASHVILLE Q12014 Inf
4 Seattle.Tacoma ESPN SEATTLE, EVERETT ZONE Q12014 8.360656
5 South.Bend.Elkhart WBND Q42013 28.260870
6 Wilkes.Barre.Scranton.Hztn WSWB Q22014 Inf
What I'd like to do is create a new column called "cpi" in my original df i.e. the applicable "cpi" value should appear against the particular row. Of course, the same value will repeat many times i.e. 8.36 will appear for every row which contains "Seattle.Tacoma" for R.DMA.NAMES, "ESPN SEATTLE, EVERETT ZONE" for station and Q12014 for quarter. I tried several things including:
transform(df, cpi = ddply(df, .(R.DMA.NAMES, station, quarter), function (x) {
cpi = sum(df$rate) / sum(df$allpersons.imp)
})
But this didn't work ! Can someone explain . .
Use transform within ddply:
# transform is applied to each group's rows, so sum(rate) and
# sum(allpersons.imp) are group totals and the resulting cpi value is
# repeated on every row of that group. The result is a new data frame with
# the extra cpi column; it is not assigned here.
ddply(df, .(R.DMA.NAMES, station, quarter),
transform, cpi = sum(rate) / sum(allpersons.imp))

Resources