My data looks like this:
outc_n state region
largezip1_outc 96 FL 1
largezip2_outc 74 FL 2
largezip3_outc 251 FL 3
largezip4_outc 176 FL 4
largezip5_outc 161 FL 5
largezip6_outc 95 FL 6
How can I automatically create labels for region so that it looks like this?
outc_n state region
largezip1_outc 96 FL FL-L-1
largezip2_outc 74 FL FL-L-2
largezip3_outc 251 FL FL-L-3
largezip4_outc 176 FL FL-L-4
largezip5_outc 161 FL FL-L-5
largezip6_outc 95 FL FL-L-6
thanks!
library(dplyr)
dat <-
read.table(text = " outc_n state region
largezip1_outc 96 FL 1
largezip2_outc 74 FL 2
largezip3_outc 251 FL 3
largezip4_outc 176 FL 4
largezip5_outc 161 FL 5
largezip6_outc 95 FL 6")
dat %>%
  mutate(region = paste(state, "L", region, sep = "-"))
You could change what goes in place of the "L" in the paste() statement programmatically with some minor edits.
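For example, here is a minimal sketch of one such edit, assuming the letter should come from the first character of the row names (which is where the "L" in the expected output appears to originate):
library(dplyr)
dat %>%
  mutate(region = paste(state, toupper(substr(rownames(dat), 1, 1)), region, sep = "-"))
Here toupper(substr(rownames(dat), 1, 1)) simply replaces the hard-coded "L"; any other vectorised expression would work in its place.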
Assuming that you want to add "L" because the row names start with the letter "l", we can use paste() to concatenate the different pieces.
dat$region <- with(dat, paste(state,
                              toupper(substring(rownames(dat), 1, 1)),
                              region,
                              sep = "-"))
dat
# outc_n state region
# largezip1_outc 96 FL FL-L-1
# largezip2_outc 74 FL FL-L-2
# largezip3_outc 251 FL FL-L-3
# largezip4_outc 176 FL FL-L-4
# largezip5_outc 161 FL FL-L-5
# largezip6_outc 95 FL FL-L-6
DATA
dat <- read.table(text = " outc_n state region
largezip1_outc 96 FL 1
largezip2_outc 74 FL 2
largezip3_outc 251 FL 3
largezip4_outc 176 FL 4
largezip5_outc 161 FL 5
largezip6_outc 95 FL 6",
header = TRUE, stringsAsFactors = FALSE)
Using data.table:
df <- read.table(header=T, text="large outc_n state region
largezip1_outc 96 FL 1
largezip2_outc 74 FL 2
largezip3_outc 251 FL 3
largezip4_outc 176 FL 4
largezip5_outc 161 FL 5
largezip6_outc 95 FL 6")
library(data.table)
setDT(df)
## use paste0() to combine the desired column values, taking the
## first letter of the first column and converting it to upper case
df[, region := paste0(state, "-", toupper(substr(large, 1, 1)), "-", region)]
large outc_n state region
1: largezip1_outc 96 FL FL-L-1
2: largezip2_outc 74 FL FL-L-2
3: largezip3_outc 251 FL FL-L-3
4: largezip4_outc 176 FL FL-L-4
5: largezip5_outc 161 FL FL-L-5
6: largezip6_outc 95 FL FL-L-6
Related
I am trying to merge two data frames in R, and an error keeps coming up even though the variable types should all be correct.
Here is my code:
library(dplyr)
team_info <- baseballr::mlb_teams(season = 2022)
team_info_mlb <- subset(team_info, sport_name == 'Major League Baseball')
tim2 <- team_info_mlb %>%
  rename('home_team' = club_name)
tim3 <- subset(tim2, select = c('team_full_name', 'home_team'))
new_pf <- baseballr::fg_park(yr = 2022)
new_pf <- subset(new_pf, select = c('home_team', '1yr'))
info_pf <- merge(tim3, new_pf, by = 'home_team')
The final line is where the problem occurs. Let me know if anyone has advice.
The problem is that the data have some fancy class attributes.
> class(tim3)
[1] "baseballr_data" "tbl_df" "tbl" "data.table" "data.frame"
> class(new_pf)
[1] "baseballr_data" "tbl_df" "tbl" "data.table" "data.frame"
Just wrap them in as.data.frame(). Since both data sets have the same by variable, you can omit the explicit by specification.
info_pf <- merge(as.data.frame(tim3), as.data.frame(new_pf))
info_pf
# home_team team_full_name 1yr
# 1 Angels Los Angeles Angels 102
# 2 Astros Houston Astros 99
# 3 Athletics Oakland Athletics 94
# 4 Blue Jays Toronto Blue Jays 106
# 5 Braves Atlanta Braves 105
# 6 Brewers Milwaukee Brewers 102
# 7 Cardinals St. Louis Cardinals 92
# 8 Cubs Chicago Cubs 103
# 9 Diamondbacks Arizona Diamondbacks 103
# 10 Dodgers Los Angeles Dodgers 98
# 11 Giants San Francisco Giants 99
# 12 Guardians Cleveland Guardians 97
# 13 Mariners Seattle Mariners 94
# 14 Marlins Miami Marlins 97
# 15 Mets New York Mets 91
# 16 Nationals Washington Nationals 97
# 17 Orioles Baltimore Orioles 108
# 18 Padres San Diego Padres 96
# 19 Phillies Philadelphia Phillies 98
# 20 Pirates Pittsburgh Pirates 101
# 21 Rangers Texas Rangers 98
# 22 Rays Tampa Bay Rays 89
# 23 Red Sox Boston Red Sox 111
# 24 Reds Cincinnati Reds 112
# 25 Rockies Colorado Rockies 112
# 26 Royals Kansas City Royals 108
# 27 Tigers Detroit Tigers 94
# 28 Twins Minnesota Twins 99
# 29 White Sox Chicago White Sox 100
# 30 Yankees New York Yankees 99
My code works 95% correctly, but I am not sure why my graph has empty white spaces for certain states. For example, Washington state has a count of 152 but appears blank, as if it were NULL.
txt <- "AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT
34 103 78 241 789 200 18 18 13 355 210 26 36 48 119 106 57 98 104 32 81 26 92 62 136 65 34 164 10 30 16 70 107 100 109 151 150 97 113 3 90 15 158 479 68 95 7
WA WI WV WY
152 96 48 14 "
dat <- stack(read.table(text = txt, header = TRUE, fill = TRUE))
names(dat)[2] <-'state.abb'
dat$states <- tolower(state.name[match(dat$state.abb, state.abb)])
library(maps)      # map()
library(maptools)  # map2SpatialPolygons()
library(sp)        # CRS(), SpatialPolygonsDataFrame(), spplot()
mapUSA <- map('state', fill = TRUE, plot = FALSE)
nms <- sapply(strsplit(mapUSA$names, ':'), function(x)x[1])
USApolygons <- map2SpatialPolygons(mapUSA, IDs = nms, CRS('+proj=longlat'))
idx <- match(unique(nms), dat$states)
dat2 <- data.frame(value = dat$value[idx], state = unique(nms))
row.names(dat2) <- unique(nms)
USAsp <- SpatialPolygonsDataFrame(USApolygons, data = dat2)
spplot(USAsp['value'], main = "Armed Males with an Attack Threat Level", sub = "Count Per State", col="transparent")
I'm trying to 're-count' a column in R and having issues after cleaning up the data. I'm cleaning the data by location, and the issue appears once I change CA to California.
library(dplyr)
all_location <- read.csv("all_location.csv", stringsAsFactors = FALSE)
all_location <- count(all_location, location)
all_location <- all_location[with(all_location, order(-n)), ]
all_location
A tibble: 100 x 2
location n
<chr> <int>
1 CA 3216
2 Alaska 2985
3 Nevada 949
4 Washington 253
5 Hawaii 239
6 Montana 218
7 Puerto Rico 149
8 California 126
9 Utah 83
10 NA 72
From the above, there are both CA and California. Below I use grep() to find CA and replace it with California. However, my issue is that the result still shows two separate California rows rather than grouping them into one.
ca1 <- grep("CA",all_location$location)
all_location$location <- replace(all_location$location,ca1,"California")
all_location
A tibble: 100 x 2
location n
<chr> <int>
1 California 3216
2 Alaska 2985
3 Nevada 949
4 Washington 253
5 Hawaii 239
6 Montana 218
7 Puerto Rico 149
8 California 126
9 Utah 83
10 NA 72
My goal is to combine both into a single total under n.
all_location$location[substr(all_location$location, 1, 5) %in% "Calif" ] <- "California"
to make sure everything that starts with "Calif" gets turned into "California".
I am assuming that the already-present "California" entries may contain a trailing space (e.g. "California "), which is why this is happening.
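If the goal is then to collapse the two California rows into a single total, a minimal sketch (assuming the dplyr count()/tibble workflow shown above) is to re-aggregate after the labels have been standardised:
library(dplyr)
all_location <- all_location %>%
  group_by(location) %>%
  summarise(n = sum(n)) %>%
  arrange(desc(n))
This just sums n within each distinct location value, so the two California counts end up in one row.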
I have this txt file and I would like to read it into R with this command:
read.table("C:/users/vatlidak/My Documents/Documents/Untitled.txt", header=TRUE)
R returns the following error:
"more columns than column names"
txt file:
height Shoesize gender Location
1 181 44 male city center
4 170 43 female city center
5 172 43 female city center
13 175 42 male out of city
14 181 44 male out of city
15 180 43 male out of city
16 177 43 female out of city
17 133 41 male out of city
If myFile contains the path/filename, then replace each of the first 4 stretches of whitespace on every line with a comma and re-read using read.csv(). No packages are used.
L <- readLines(myFile) ##
for(i in 1:4) L <- sub("\\s+", ",", L)
DF <- read.csv(text = L)
giving:
> DF
height Shoesize gender Location
1 181 44 male city center
4 170 43 female city center
5 172 43 female city center
13 175 42 male out of city
14 181 44 male out of city
15 180 43 male out of city
16 177 43 female out of city
17 133 41 male out of city
Note: For purposes of testing we can use this in place of the line marked ## above. (Note that SO can introduce spaces at the beginnings of the lines so we remove them.)
Lines <- " height Shoesize gender Location
1 181 44 male city center
4 170 43 female city center
5 172 43 female city center
13 175 42 male out of city
14 181 44 male out of city
15 180 43 male out of city
16 177 43 female out of city
17 133 41 male out of city"
L <- readLines(textConnection(Lines))
L[-1] <- sub("^\\s+", "", L[-1])
It is a bit late, but I had the same problem and the other answers did not work on my dataset. I just converted the csv file into an xlsx file and it worked without any extra steps. Like this:
library(gdata)
df <- read.xls(file, sheet = 1, row.names=1)
This may help future readers.
I am new to plyr (and R) and looking for a little help to get started. Using the baseball dataset as an example, how could I calculate the year-over-year (yoy) change in "at bats" by league and team (lg and team)?
library(plyr)
df1 <- aggregate(ab~year+lg+team, FUN=sum, data=baseball)
After doing a little aggregating to simplify the data frame, the data looks like this:
head(df1)
year lg team ab
1884 UA ALT 108
1997 AL ANA 1703
1998 AL ANA 1502
1999 AL ANA 660
2000 AL ANA 85
2001 AL ANA 219
I would like to end up with something like this:
year lg team ab yoy
1997 AL ANA 1703 NA
1998 AL ANA 1502 -201
1999 AL ANA 660 -842
2000 AL ANA 85 -575
2001 AL ANA 219 134
I started by writing the following function, which I think is wrong:
yoy.func <- function(df) {
  lag <- c(df$ab[-1], 0)
  cur <- c(df$ab[1], 0)
  df$yoy <- cur - lag
  return(df)
}
Without success, I used the following code to attempt to return the yoy change.
df2 <- ddply(df1, .(lg, team), yoy.func)
Any guidance would be appreciated.
Thanks
I know you asked for a "plyr"-specific solution, but for the sake of sharing, here is an alternative approach in base R. In my opinion, the base R approach is just as readable. And, at least in this particular case, it's a lot faster!
output <- within(df1, {
  yoy <- ave(ab, team, lg, FUN = function(x) c(NA, diff(x)))
})
head(output)
# year lg team ab yoy
# 1 1884 UA ALT 108 NA
# 2 1997 AL ANA 1703 NA
# 3 1998 AL ANA 1502 -201
# 4 1999 AL ANA 660 -842
# 5 2000 AL ANA 85 -575
# 6 2001 AL ANA 219 134
library(rbenchmark)
benchmark(DDPLY = {
  ddply(df1, .(team, lg), mutate,
        yoy = c(NA, diff(ab)))
}, WITHIN = {
  within(df1, {
    yoy <- ave(ab, team, lg, FUN = function(x) c(NA, diff(x)))
  })
}, columns = c("test", "replications", "elapsed",
               "relative", "user.self"))
# test replications elapsed relative user.self
# 1 DDPLY 100 10.675 4.974 10.609
# 2 WITHIN 100 2.146 1.000 2.128
Update: data.table
If your data are very large, check out data.table. Even with this example, you'll find a good speedup in relative terms. Plus the syntax is super compact and, in my opinion, easily readable.
library(plyr)
df1 <- aggregate(ab~year+lg+team, FUN=sum, data=baseball)
library(data.table)
DT <- data.table(df1)
DT
# year lg team ab
# 1: 1884 UA ALT 108
# 2: 1997 AL ANA 1703
# 3: 1998 AL ANA 1502
# 4: 1999 AL ANA 660
# 5: 2000 AL ANA 85
# ---
# 2523: 1895 NL WSN 839
# 2524: 1896 NL WSN 982
# 2525: 1897 NL WSN 1426
# 2526: 1898 NL WSN 1736
# 2527: 1899 NL WSN 787
Now, look at this concise solution:
DT[, yoy := c(NA, diff(ab)), by = "team,lg"]
DT
# year lg team ab yoy
# 1: 1884 UA ALT 108 NA
# 2: 1997 AL ANA 1703 NA
# 3: 1998 AL ANA 1502 -201
# 4: 1999 AL ANA 660 -842
# 5: 2000 AL ANA 85 -575
# ---
# 2523: 1895 NL WSN 839 290
# 2524: 1896 NL WSN 982 143
# 2525: 1897 NL WSN 1426 444
# 2526: 1898 NL WSN 1736 310
# 2527: 1899 NL WSN 787 -949
How about using diff():
df <- read.table(header = TRUE, text = ' year lg team ab
1884 UA ALT 108
1997 AL ANA 1703
1998 AL ANA 1502
1999 AL ANA 660
2000 AL ANA 85
2001 AL ANA 219')
require(plyr)
ddply(df, .(team, lg), mutate,
      yoy = c(NA, diff(ab)))
# year lg team ab yoy
# 1 1884 UA ALT 108 NA
# 2 1997 AL ANA 1703 NA
# 3 1998 AL ANA 1502 -201
# 4 1999 AL ANA 660 -842
# 5 2000 AL ANA 85 -575
# 6 2001 AL ANA 219 134