Converting values to special character in summary - r

I have created a summary table like below
Name Sales
AS 71.5%
DY 88.4%
VH 44.6%
MY 86.9%
HU 42.3%
TT 67.2%
BG 0.0%
SA 85.3%
Now I want to replace every occurrence of 0.0 with "-".
I have tried
tab[,2] <- paste0(tab[,2],"%")
tab[,2] <- replace(tab[,2],tab[,2]<0,"-")
but it's also converting other values, such as 8.0 and 7.0, to "-".
Is there any other solution?
the output should be like
Name Sales
AS 71.5%
DY 88.4%
BG -
so the whole function is like this, have three columns of os sales for each person

You can try this:
# Data: eight people with their sales figure; Name is stored as a factor.
df <- data.frame(
  Name = factor(c("AS", "DY", "VH", "MY", "HU", "TT", "BG", "SA")),
  Sales = c(71.5, 88.4, 44.6, 86.9, 42.3, 67.2, 0, 85.3)
)
# Code: locate the rows whose sales are zero and overwrite them with "-".
# Writing a string into the numeric column turns the whole column into
# character.
index <- which(df$Sales == 0)
df$Sales <- replace(df$Sales, index, "-")
Name Sales
1 AS 71.5
2 DY 88.4
3 VH 44.6
4 MY 86.9
5 HU 42.3
6 TT 67.2
7 BG -
8 SA 85.3
Update with new data
New data has been provided:
df2 <- structure(list(Name = c("AS", "DY", "VH", "MY", "HU", "TT", "BG",
"SA"), Sales = c("71.5%", "88.4%", "44.6%", "86.9%", "42.3%",
"67.2%", "0.0%", "85.3%")), class = "data.frame", row.names = c(NA,
-8L))
# Replace a label with "-" only when the whole string is "0.0%".
# A substring match (gsub("0.0%", "-", x, fixed = TRUE)) would also rewrite
# "10.0%" to "1-" and "50.0%" to "5-", so compare complete strings instead.
# which() drops NA comparisons, so missing labels are left untouched.
dash_if_zero <- function(x) {
  replace(x, which(x == "0.0%"), "-")
}
df2$Sales2 <- dash_if_zero(df2$Sales)
Name Sales Sales2
1 AS 71.5% 71.5%
2 DY 88.4% 88.4%
3 VH 44.6% 44.6%
4 MY 86.9% 86.9%
5 HU 42.3% 42.3%
6 TT 67.2% 67.2%
7 BG 0.0% -
8 SA 85.3% 85.3%
Update with variable
Using first data df:
# Format every sales value as a percentage label, e.g. 71.5 -> "71.5%".
pct <- paste0(df$Sales, "%")
# A two-character label can only change when it is exactly "0%", which is
# the label produced for a zero sale; that one becomes "-".
df$tab <- ifelse(
  nchar(pct) == 2,
  gsub("0%", "-", pct, fixed = TRUE),
  pct
)
Name Sales tab
1 AS 71.5 71.5%
2 DY 88.4 88.4%
3 VH 44.6 44.6%
4 MY 86.9 86.9%
5 HU 42.3 42.3%
6 TT 67.2 67.2%
7 BG 0.0 -
8 SA 85.3 85.3%

Try this:
# Overwrite the zero entries of Sales with "-"; which() skips NA comparisons.
tab$Sales[which(tab$Sales == 0)] <- "-"
I'd also recommend looking into dplyr's mutate.

Related

Connecting two sets of coordinates to create lines using sf/mapview

I have a dataset where a bird is captured in one location (Blong, Blat) and then encountered again in another (Elong, Elat). These coordinates are in a lat/long format, and I'd like to connect the capture and encounter locations with a line and plot them in mapview.
In the data below, each row is an individual bird with its capture/encounter coordinates and the flyway that it belongs to (which I would like to use to color the lines in mapview).
# Example data: 10 rows, one per bird. Blong/Blat and Elong/Elat are the two
# longitude/latitude pairs of each row; Flyway is a factor with the levels
# Central, Eastern and West. The object carries the tibble classes
# (tbl_df, tbl, data.frame).
dat <- structure(list(Blong = c(-75.58333, -76.08333, -81.08333, -94.25,
-75.41667, -99.41667, -77.41667, -116.08333, -89.58333, -77.58333
), Blat = c(37.58333, 40.58333, 42.75, 41.91667, 38.25, 28.25,
38.91667, 43.58333, 44.25, 38.91667), Elong = c(-65.91667, -75.75,
-80.58333, -95.41667, -73.58333, -89.41667, -77.58333, -116.41667,
-96.41667, -77.41667), Elat = c(45.91667, 40.58333, 42.75, 29.75,
45.58333, 48.25, 38.75, 43.58333, 34.08333, 38.91667), Flyway = structure(c(2L,
2L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L), .Label = c("Central", "Eastern",
"West"), class = "factor")), .Names = c("Blong", "Blat", "Elong",
"Elat", "Flyway"), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
A look at the data:
# A tibble: 10 x 5
Blong Blat Elong Elat Flyway
<dbl> <dbl> <dbl> <dbl> <fct>
1 -75.6 37.6 -65.9 45.9 Eastern
2 -76.1 40.6 -75.8 40.6 Eastern
3 -81.1 42.8 -80.6 42.8 Eastern
4 -94.2 41.9 -95.4 29.8 Central
5 -75.4 38.2 -73.6 45.6 Eastern
6 -99.4 28.2 -89.4 48.2 Eastern
7 -77.4 38.9 -77.6 38.8 Eastern
8 -116. 43.6 -116. 43.6 West
9 -89.6 44.2 -96.4 34.1 Eastern
10 -77.6 38.9 -77.4 38.9 Eastern
I've looked at a few examples, but haven't found one that looks quite like my data set.
The tricky thing is to create a valid LINESTRING object from the coordinate pairs in wide format. sf expects linestring coordinates in rows of a matrix. Here's a way that works. The sfc column of a sf object is a list so here we use lapply to loop over the rows of the data you provided.
library(sf)
library(mapview)
# Start (capture) and end (encounter) coordinates as plain numeric matrices,
# one row per bird, columns in (longitude, latitude) order.
start_xy <- as.matrix(dat[, c("Blong", "Blat")])
end_xy <- as.matrix(dat[, c("Elong", "Elat")])
# Build one two-point LINESTRING per row. st_linestring() expects the points
# as the rows of a matrix. seq_len() keeps the loop empty for a zero-row
# table, where seq(nrow(...)) would iterate over c(1, 0).
line_list <- lapply(seq_len(nrow(dat)), function(i) {
  st_linestring(unname(rbind(start_xy[i, ], end_xy[i, ])))
})
# Combine all lines into a single geometry column in WGS84 (EPSG:4326).
dat$geometry <- st_sfc(line_list, crs = 4326)
dat_sf <- st_as_sf(dat)
# Colour the lines by flyway.
mapview(dat_sf, zcol = "Flyway")

Picking a number from vector and assign to column based on multiple conditions in R

I need to add a Thickness column to my Products table based on multiple conditions.
1 : Thickness should be only one of these values
Plate_Thickness <- c(5.8,25.1,27.1,32.5,55.6,98.1,120.4)
2 : Thickness should be between the ThicknessMin and ThicknessMax values already existing in table.
Current table looks like this:
Product ThicknessMin ThicknessMax
P0001 0 8
P0002 31.01 70
P0003 8.01 31
P0004 70.01 999
P0005 8.01 31
So, the idea is to pick a value for Thickness from the vector randomly but it should be between the ThicknessMin and ThicknessMax. Please help with any pointers how to go about this. Thanks.
A vectorized base R solution (df is your data.frame):
set.seed(1) # just for reproducibility
# Plate_Thickness must be sorted in increasing order for findInterval().
# a: index of the first plate that is >= ThicknessMin. With left.open = TRUE
#    findInterval() counts the plates strictly below the minimum, so the next
#    index is the first admissible one. (Using the plain interval index here
#    would point at the plate just below the minimum.)
a <- findInterval(df$ThicknessMin, Plate_Thickness, left.open = TRUE) + 1L
# b: index of the last plate that is <= ThicknessMax. No all.inside, so the
#    largest plate stays reachable.
b <- findInterval(df$ThicknessMax, Plate_Thickness)
# Number of admissible plates per row; zero or less means no plate fits.
n_ok <- b - a + 1L
# Uniform draw of an offset in 0..(n_ok - 1) for every row.
offset <- floor(runif(length(a)) * n_ok)
Plate_Thickness[ifelse(n_ok > 0, a + offset, NA)]
#[1] 5.8 32.5 27.1 120.4 25.1
Your data
# Candidate plate thicknesses, listed in increasing order.
Plate_Thickness <- c(5.8,25.1,27.1,32.5,55.6,98.1,120.4)
# Product table with the allowed thickness range per product. This copy also
# carries a Plate_Thickness column and has class c("data.table", "data.frame").
df <- structure(list(Product = c("P0001", "P0002", "P0003", "P0004",
"P0005"), ThicknessMin = c(0, 31.01, 8.01, 70.01, 8.01), ThicknessMax = c(8L,
70L, 31L, 999L, 31L), Plate_Thickness = c(5.8, 32.5, 27.1, 120.4,
25.1)), .Names = c("Product", "ThicknessMin", "ThicknessMax",
"Plate_Thickness"), row.names = c(NA, -5L), class = c("data.table",
"data.frame"))
solution
library(dplyr)
# For every product, keep the plate thicknesses that lie inside
# [ThicknessMin, ThicknessMax] (between() includes both bounds).
# seq_len() yields an empty loop for a zero-row table, unlike 1:nrow(df).
acceptable_vals <- lapply(seq_len(nrow(df)), function(i) {
  Plate_Thickness[
    between(Plate_Thickness, df$ThicknessMin[i], df$ThicknessMax[i])
  ]
})
set.seed(1)
# Draw one candidate per product by sampling an index. A product without any
# admissible plate gets NA instead of a zero-length result, and vapply()
# guarantees a plain numeric column.
df$Plate_Thickness <- vapply(acceptable_vals, function(vals) {
  if (length(vals) == 0L) {
    return(NA_real_)
  }
  vals[sample.int(length(vals), 1L)]
}, numeric(1))
Output
Product ThicknessMin ThicknessMax Plate_Thickness
1: P0001 0.00 8 5.8
2: P0002 31.01 70 32.5
3: P0003 8.01 31 27.1
4: P0004 70.01 999 120.4
5: P0005 8.01 31 25.1
We can use the rowwise function from the dplyr package to sample from the Plate_Thickness vector. Within the call to sample, we sample only from elements of Plate_Thickness which are between ThicknessMin and ThicknessMax. I put your table in a data.frame called dat:
library(dplyr)
set.seed(123)
# Pick one element of `values` at random, or NA when there is nothing to pick.
# An index is drawn instead of calling sample(values, 1): sample() treats a
# single number x >= 1 as the range 1:x, so a lone candidate such as 5.8 could
# come back as 1, 2, 3, 4 or 5 rather than 5.8.
pick_one <- function(values) {
  if (length(values) == 0L) {
    return(NA_real_)
  }
  values[sample.int(length(values), 1L)]
}
# Row by row, sample among the plates inside [ThicknessMin, ThicknessMax].
dat %>%
  rowwise() %>%
  mutate(thick_sample = pick_one(
    Plate_Thickness[between(Plate_Thickness, ThicknessMin, ThicknessMax)]
  ))
Product ThicknessMin ThicknessMax thick_sample
<fctr> <dbl> <int> <dbl>
1 P0001 0.00 8 2.0
2 P0002 31.01 70 55.6
3 P0003 8.01 31 25.1
4 P0004 70.01 999 120.4
5 P0005 8.01 31 27.1
Data (for reproducibility)
# Input table as a plain data.frame: Product is a factor, ThicknessMin is
# double and ThicknessMax is integer.
dat <- structure(list(Product = structure(1:5, .Label = c("P0001", "P0002",
"P0003", "P0004", "P0005"), class = "factor"), ThicknessMin = c(0,
31.01, 8.01, 70.01, 8.01), ThicknessMax = c(8L, 70L, 31L, 999L,
31L)), .Names = c("Product", "ThicknessMin", "ThicknessMax"), class = "data.frame", row.names = c(NA,
-5L))
# DATA
df <- structure(list(Product = c("P0001", "P0002", "P0003", "P0004",
"P0005"), ThicknessMin = c(0, 31.01, 8.01, 70.01, 8.01), ThicknessMax = c(8L,
70L, 31L, 999L, 31L)), .Names = c("Product", "ThicknessMin",
"ThicknessMax"), class = c("data.table", "data.frame"), row.names = c(NA,
-5L))
Plate_Thickness <- c(5.8, 25.1, 27.1, 32.5, 55.6, 98.1, 120.4)
set.seed(1)
# For each row, keep the plates strictly between ThicknessMin and
# ThicknessMax and draw one of them.
# The columns are read with `$` instead of apply() on the table, so nothing
# is coerced to a matrix and the code does not rely on data.frame-style `[`.
thickness <- vapply(seq_len(nrow(df)), function(i) {
  eligible <- Plate_Thickness[
    Plate_Thickness > df$ThicknessMin[i] & Plate_Thickness < df$ThicknessMax[i]
  ]
  # No plate fits this row.
  if (length(eligible) == 0L) {
    return(NA_real_)
  }
  # Sample an index, not the values: sample(5.8, 1) would draw from 1:5 and
  # could return 2, which is not a plate thickness at all.
  eligible[sample.int(length(eligible), 1L)]
}, numeric(1))
thickness
# e.g. 5.8 32.5 27.1 120.4 25.1 -- the first value is always 5.8 (the only
# eligible plate); the remaining draws depend on the RNG stream.

Grouping variables with barplot in RStudio

Income1.csv
Age.Group X X.1 X.2 X.3 X.4
1 Income 16-24 25-34 35-44 45-54 55+
2 Low 13.9 17.4 14.9 11.9 10.9
3 Medium 26.3 46.9 42.2 30.7 21.5
4 High 11.6 19.7 22.4 17.4 6.7
How do you create a grouped barplot with the height as Age? The picture below is what I want to create.
Read your data:
# Reproducible copy of the income table: one row per income level (Low,
# Medium, High) and one numeric column per age group.
# The literal is assigned directly; `d <- dput(d)` only works when `d` already
# exists, so it cannot be used to create the data from scratch.
d <- structure(list(Income = structure(c(2L, 3L, 1L), .Label = c("High",
"Low", "Medium"), class = "factor"), `16-24` = c(13.9, 26.3,
11.6), `25-34` = c(17.4, 46.9, 19.7), `35-44` = c(14.9, 42.2,
22.4), `45-54` = c(11.9, 30.7, 17.4), `55+` = c(10.9, 21.5, 6.7
)), .Names = c("Income", "16-24", "25-34", "35-44", "45-54",
"55+"), class = "data.frame", row.names = c(NA, -3L))
Plot your data: beside specifies that the values are plotted beside not stacked.
# Drop the Income column, plot the remaining numeric columns as a matrix and
# put the bars of each age group side by side; the legend lists the income
# levels. TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
barplot(as.matrix(d[, -1]), beside = TRUE, legend.text = d$Income)

grouping, comparing, and counting rows in r

I have a data frame that looks as the following:
system Id initial final
665 9 16001 6070 6071
683 10 16001 6100 6101
696 11 16001 6101 6113
712 10 16971 6150 6151
715 11 16971 6151 6163
4966 7 4118 10238 10242
5031 9 4118 10260 10278
5088 10 4118 10279 10304
5115 11 4118 10305 10317
# dput()-style literal of the input data frame printed above: 9 rows with the
# columns system, Id, initial and final, and row names 665, 683, ..., 5115.
# Note that the expression is not assigned to a name here.
structure(list(system = c(9L, 10L, 11L, 10L, 11L, 7L, 9L, 10L,
11L), Id = c(16001L, 16001L, 16001L, 16971L, 16971L, 4118L, 4118L,
4118L, 4118L), initial = c(6070, 6100, 6101, 6150, 6151, 10238,
10260, 10279, 10305), final = c(6071, 6101, 6113, 6151, 6163,
10242, 10278, 10304, 10317)), .Names = c("system", "Id", "initial",
"final"), row.names = c(665L, 683L, 696L, 712L, 715L, 4966L,
5031L, 5088L, 5115L), class = "data.frame")
I would like to get a new data frame with the next structure
Id system length initial final
1 16001 9,10,11 3 6070 6113
2 16971 10,11 2 6150 6163
3 4118 7 1 10238 10242
4 4118 9,10,11 3 10260 10317
# dput()-style literal of the desired result (4 rows). `system` is a factor
# holding the comma-separated system values of each group, and `length` is
# the number of rows that went into the group.
structure(list(Id = c(16001L, 16971L, 4118L, 4118L), system = structure(c(3L,
1L, 2L, 3L), .Label = c("10,11", "7", "9,10,11"), class = "factor"),
length = c(3L, 2L, 1L, 3L), initial = c(6070L, 6150L, 10238L,
10260L), final = c(6113, 6163, 10242, 10317)), .Names = c("Id",
"system", "length", "initial", "final"), class = "data.frame", row.names = c(NA,
-4L))
Rows should be grouped by Id, and within an Id consecutive rows belong to the same group as long as the difference in the "system" field between them equals one. For each group I would also like to list the "system" values involved and how many rows were grouped. Finally, I need a column with the first "initial" and another with the last "final" of the group.
Is it possible to do that in R?
Thanks.
You could use data.table. Convert "data.frame" to "data.table" (setDT), create a grouping variable "indx" by taking the difference of adjacent elements of "system" (diff(system)), cumsum the logical vector, use "Id" and "indx" as grouping variable to get the statistics.
library(data.table)
# Turn df into a data.table by reference, then summarise every run of
# consecutive `system` values within an Id.
setDT(df)
df[,
  .(
    system = toString(system),
    length = .N,
    initial = initial[1L],
    final = final[.N]
  ),
  # indx grows by one whenever `system` does not follow the previous row by
  # exactly 1, so each run gets its own group key.
  by = .(Id, indx = cumsum(c(TRUE, diff(system) != 1)))
][, indx := NULL][]
# Id system length initial final
#1: 16001 9, 10, 11 3 6070 6113
#2: 16971 10, 11 2 6150 6163
#3: 4118 7 1 10238 10242
#4: 4118 9, 10, 11 3 10260 10317
Or, based on @jazzurro's comment about using the first/last functions from dplyr:
library(dplyr)
# Label each run of consecutive `system` values first, then summarise per
# run and Id.
df %>%
  mutate(indx = cumsum(c(TRUE, diff(system) != 1))) %>%
  group_by(indx, Id) %>%
  summarise(
    system = toString(system),
    length = n(),
    initial = first(initial),
    final = last(final)
  )
A solution without data.table, but plyr:
library(plyr)
# Summarise the rows of one Id: one output row per run of consecutive
# `system` values, with the systems involved, the run length, the first
# `initial` and the last `final`.
func <- function(subdf) {
  # Start a new run whenever `system` does not follow the previous row by
  # exactly 1. A plain TRUE/FALSE split would merge separate runs: systems
  # 9, 10, 12, 13 would be reported as "9,12,13" and "10".
  run_id <- cumsum(c(TRUE, diff(subdf$system) != 1))
  ldply(split(subdf, run_id), function(u) {
    data.frame(
      system = paste(u$system, collapse = ","),
      Id = unique(u$Id),
      length = nrow(u),
      initial = head(u, 1)$initial,
      final = tail(u, 1)$final
    )
  })
}
# Apply the summary to every Id. The `.id` column of the result should now
# show the run number within each Id instead of FALSE/TRUE.
ldply(split(df, df$Id), func)
# .id system length Id initial final
#1 FALSE 7 1 4118 10238 10242
#2 TRUE 9,10,11 3 4118 10260 10317
#3 TRUE 9,10,11 3 16001 6070 6113
#4 TRUE 10,11 2 16971 6150 6163

Populating new variable from ddply within old data frame in R

I have a data.frame which looks like this (in reality 1M rows):
`> df
R.DMA.NAMES quarter daypart allpersons.imp rate station spot.id
1 Wilkes.Barre.Scranton.Hztn Q22014 afternoon 0.0 30 WSWB 13048713
2 Nashville Q12014 primetime 0.0 50 COM NASHVILLE 11969260
3 Seattle.Tacoma Q12014 primetime 6.1 51 ESPN SEATTLE, EVERETT ZONE 11898905
4 Jacksonville Q42013 late fringe 2.3 130 Jacksonville WAWS 11617447
5 Detroit Q22014 overnight 0.0 0 WKBD 12571421
6 South.Bend.Elkhart Q42013 primetime 11.5 325 WBND 11741171`
dput(df)
# Output of dput(df) for the six sample rows shown above. quarter is a factor
# with six levels, spot.id is integer, allpersons.imp and rate are numeric,
# and the remaining columns are character.
structure(list(R.DMA.NAMES = c("Wilkes.Barre.Scranton.Hztn",
"Nashville", "Seattle.Tacoma", "Jacksonville", "Detroit", "South.Bend.Elkhart"
), quarter = structure(c(3L, 1L, 1L, 6L, 3L, 6L), .Label = c("Q12014",
"Q22013", "Q22014", "Q32013", "Q32014", "Q42013"), class = "factor"),
daypart = c("afternoon", "primetime", "primetime", "late fringe",
"overnight", "primetime"), allpersons.imp = c(0, 0, 6.1,
2.3, 0, 11.5), rate = c(30, 50, 51, 130, 0, 325), station = c("WSWB",
"COM NASHVILLE", "ESPN SEATTLE, EVERETT ZONE", "Jacksonville WAWS",
"WKBD", "WBND"), spot.id = c(13048713L, 11969260L, 11898905L,
11617447L, 12571421L, 11741171L)), .Names = c("R.DMA.NAMES",
"quarter", "daypart", "allpersons.imp", "rate", "station", "spot.id"
), row.names = c(NA, -6L), class = "data.frame")
I am using a ddply function to perform a calculation:
# One ratio (sum of rate / sum of allpersons.imp) per
# R.DMA.NAMES / station / quarter group.
ddply(df, .(R.DMA.NAMES, station, quarter), function(x) {
  # Use the rows of the current group (x). Referring to the full `df` here
  # would return the same overall ratio for every group.
  sum(x$rate) / sum(x$allpersons.imp)
})
This creates a new data.frame which looks like this:
R.DMA.NAMES station quarter V1
1 Detroit WKBD Q22014 NaN
2 Jacksonville Jacksonville WAWS Q42013 56.521739
3 Nashville COM NASHVILLE Q12014 Inf
4 Seattle.Tacoma ESPN SEATTLE, EVERETT ZONE Q12014 8.360656
5 South.Bend.Elkhart WBND Q42013 28.260870
6 Wilkes.Barre.Scranton.Hztn WSWB Q22014 Inf
What I'd like to do is create a new column called "cpi" in my original df i.e. the applicable "cpi" value should appear against the particular row. Of course, the same value will repeat many times i.e. 8.36 will appear for every row which contains "Seattle.Tacoma" for R.DMA.NAMES, "ESPN SEATTLE, EVERETT ZONE" for station and Q12014 for quarter. I tried several things including:
transform(df, cpi = ddply(df, .(R.DMA.NAMES, station, quarter), function (x) {
cpi = sum(df$rate) / sum(df$allpersons.imp)
})
But this didn't work! Can someone explain why?
Use transform within ddply:
# For each R.DMA.NAMES / station / quarter group, add a cpi column holding the
# group's sum(rate) / sum(allpersons.imp); the value repeats on every row of
# the group and all original rows are kept.
ddply(df, .(R.DMA.NAMES, station, quarter), function(grp) {
  grp$cpi <- sum(grp$rate) / sum(grp$allpersons.imp)
  grp
})

Resources