Assuming the following dataset:
df <- structure(list(id = 1:9, city = structure(c(1L, 7L, 2L, 6L, 4L,
9L, 3L, 8L, 5L), .Label = c("bj", "gz", "lz", "nj", "sh", "sz",
"tj", "wh", "xa"), class = "factor")), class = "data.frame", row.names = c(NA,
-9L))
How can I create a new column direction based on the following conditions:
if city is in list ['bj', 'tj'], then returns north for direction, if in ['sz', 'nj', 'sh'] returns east, if in ['xa', 'lz'] returns west, if in ['wh'] returns center, if in ['gz', 'sz'] returns south.
The expected result should look like this:
My code:
df %>%
filter(city %in% c('bj', 'tj')) %>%
mutate(direction = 'north')
Out:
Use case_when :
library(dplyr)

# Label each city with its direction. case_when() uses the first condition
# that matches, so "sz" (listed under both east and south) is labelled "east";
# a city matching no condition gets NA.
df %>%
  mutate(
    direction = case_when(
      city %in% c("bj", "tj") ~ "north",
      city %in% c("sz", "nj", "sh") ~ "east",
      city %in% c("xa", "lz") ~ "west",
      city %in% "wh" ~ "center",
      city %in% c("gz", "sz") ~ "south"
    )
  )
# id city direction
#1 1 bj north
#2 2 tj north
#3 3 gz south
#4 4 sz east
#5 5 nj east
#6 6 xa west
#7 7 lz west
#8 8 wh center
#9 9 sh east
You can do it in an easy way using basic R data.frame manipulation:
# Start from an empty label and overwrite it group by group. Assignments run
# in order, so a city listed in two groups ("sz") keeps the label written
# last ("south").
df$direction <- ""
df$direction[df$city %in% c("bj", "tj")] <- "north"
df$direction[df$city %in% c("sz", "nj", "sh")] <- "east"
df$direction[df$city %in% c("xa", "lz")] <- "west"
df$direction[df$city %in% "wh"] <- "center"
df$direction[df$city %in% c("gz", "sz")] <- "south"
df
id city direction
1 1 bj north
2 2 tj north
3 3 gz south
4 4 sz south
5 5 nj east
6 6 xa west
7 7 lz west
8 8 wh center
9 9 sh east
Using nested ifelse statements can do the job as well.
# Nested ifelse(): conditions are tested outermost-first, so "sz" (listed
# under both east and south) is labelled "east". "wh" is matched explicitly
# and any city outside the known lists yields NA instead of silently being
# labelled "center".
df$direction <- ifelse(
  df$city %in% c("bj", "tj"), "north",
  ifelse(
    df$city %in% c("sz", "nj", "sh"), "east",
    ifelse(
      df$city %in% c("xa", "lz"), "west",
      ifelse(
        df$city %in% c("gz", "sz"), "south",
        ifelse(df$city %in% "wh", "center", NA_character_)
      )
    )
  )
)
You can try stack to create a dictionary first and then match the cities, e.g.,
# Build a two-column lookup from a named list: `values` holds the city and
# `ind` the direction it was listed under.
d <- stack(list(
  north = c("bj", "tj"),
  east = c("sz", "nj", "sh"),
  west = c("xa", "lz"),
  center = "wh",
  south = c("gz", "sz")
))

# match() returns the first position of each city in the lookup, so "sz"
# (present twice) resolves to "east".
df$direction <- d$ind[match(df$city, d$values)]
which gives
id city direction
1 1 bj north
2 2 tj north
3 3 gz south
4 4 sz east
5 5 nj east
6 6 xa west
7 7 lz west
8 8 wh center
9 9 sh east
I have also tried another solution with mutate, but the error message is the same, complaining about the wrong symbol.
Here I loaded the dplyr library first.
library(dplyr)
new_soccer_referee %>%
mutate(postion_new = case_when (position %in% c("Right Fullback", "Left Fullback", "Center Back", "Defensive Midfielder") ~ "Defense",
position %in% c("Right Midfielder", "Left Midfielder", "Center Midfielder") ~ "Midfield",
position %in% c("Attacking Midfielder", "Right Winger", "Left Winger", "Center Forward") ~ "Offense",
))
Related
I have a large DF with certain columns that have a vector of character values as below. The number of columns varies from dataset to dataset as well as the number of character vectors it holds also varies.
ID Country1 Country2 Country3
1 1 Argentina, Japan,USA,Poland, Argentina,USA Pakistan
2 2 Colombia, Mexico,Uruguay,Dutch Mexico,Uruguay Afganisthan
3 3 Argentina, Japan,USA,NA Japan Khazagistan
4 4 Colombia, Mexico,Uruguay,Dutch Colombia, Dutch North Korea
5 5 India, China China Iran
Would like to match them one-to-one with another string vector as below
vals_to_find <-c("Argentina","USA","Mexico")
If a column/row matches any one of the strings passed, I would like to retain that column and row, then remove duplicates, and finally remove the values that do not match.
the desired output is as follows
ID Countries.found
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
data
dput(df)
structure(list(ID = 1:5, Country1 = c("Argentina, Japan,USA,Poland,",
"Colombia, Mexico,Uruguay,Dutch", "Argentina, Japan,USA,NA",
"Colombia, Mexico,Uruguay,Dutch", "India, China"), Country2 = c("Argentina,USA",
"Mexico,Uruguay", "Japan", "Colombia, Dutch", "China"), Country3 = c("Pakistan",
"Afganisthan", "Khazagistan", "North Korea", "Iran")), class = "data.frame", row.names = c(NA,
-5L))
dput(df_out)
structure(list(ID = 1:4, Countries.found = c("Argentina, USA",
"Mexico", "Argentina, USA", "Mexico")), class = "data.frame", row.names = c(NA,
-4L))
If, instead of each column holding a vector of values, the file is read with one value per column, then I was able to do it as below:
dput(df_out)
structure(list(ID = 1:5, X1 = c("Argentina", "Colombia", "Argentina",
"Colombia", "India"), X2 = c("Japan", "Mexico", "Japan", "Mexico",
"China"), X3 = c("USA", "Uruguay", "USA", "Uruguay", NA), X4 = c("Poland",
"Dutch", NA, "Dutch", NA), X5 = c("Argentina", "Mexico", "Japan",
"Colombia", "China"), X6 = c("USA", "Uruguay", NA, "Dutch", NA
), X7 = c("Pakistan", "Afganisthan", "Khazagistan", "North Korea",
"Iran")), class = "data.frame", row.names = c(NA, -5L))
df_out %>%
dplyr::select(
where(~ !all(is.na(.x)))
) %>%
dplyr::select(c(1, where(~ any(.x %in% vals_to_find)))) %>%
dplyr::mutate(dplyr::across(
tidyselect::starts_with("X"),
~ vals_to_find[match(., vals_to_find)]
)) %>%
tidyr::unite("countries_found", tidyselect::starts_with("X"),
sep = " | ", remove = TRUE, na.rm = TRUE
)
Output
ID countries_found
1 1 Argentina | USA | Argentina | USA
2 2 Mexico | Mexico
3 3 Argentina | USA
4 4 Mexico
unite the "Country" columns, then create a long vector by separating the values into rows, get all distinct values per ID, filter only those who are in vals_to_find, and summarise each countries.found toString.
library(tidyr)
library(dplyr)
df %>%
unite("Country", starts_with("Country"), sep = ",") %>%
separate_rows(Country) %>%
distinct(ID, Country) %>%
filter(Country %in% vals_to_find) %>%
group_by(ID) %>%
summarise(Countries.found = toString(Country))
output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
We may use
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(across(starts_with("Country"),
~ str_extract_all(.x, str_c(vals_to_find, collapse = "|")))) %>%
pivot_longer(cols = -ID, names_to = NULL,
values_to = 'Countries.found') %>%
unnest(Countries.found) %>%
distinct %>%
group_by(ID) %>%
summarise(Countries.found = toString(Countries.found))
-output
# A tibble: 4 × 2
ID Countries.found
<int> <chr>
1 1 Argentina, USA
2 2 Mexico
3 3 Argentina, USA
4 4 Mexico
I have some data that looks like this:
id
ethnicity
1
white
2
south asian
2
other
3
other
4
white
4
south asian
as seen above there is potential for an id to have two ethnicity values. How would I go about removing these 'other' rows if that id already has an entry such as "white" or "south asian" while keeping the "white" or "south asian" entry?
I have noticed there are entries which also have south asian along with a white entry
My priority would be South Asian > White > Other in terms of keeping rows
So an expected output would be
id
ethnicity
1
white
2
south asian
3
other
4
south asian
If the intention is to get the prioritized 'ethnicity' per 'id', convert the column 'ethnicity' to ordered with levels specified in the order of preference, then do a group by 'id' and filter the first available level in that order
library(dplyr)
df2 %>%
mutate(ethnicity = ordered(ethnicity,
c( "south asian", "white", "other"))) %>%
group_by(id) %>%
filter(ethnicity %in% first(levels(droplevels(ethnicity)))) %>%
ungroup
-output
# A tibble: 4 × 2
id ethnicity
<int> <ord>
1 1 white
2 2 south asian
3 3 other
4 4 south asian
data
df2 <- structure(list(id = c(1L, 2L, 2L, 3L, 4L, 4L), ethnicity = c("white",
"south asian", "other", "other", "white", "south asian")),
class = "data.frame", row.names = c(NA,
-6L))
I'm trying to write a function in R that generates three values by looping over each row of a df: it checks the value of one column and, if that value meets a condition, adds the value of a column for this row to a running value.
I thought that using the case_when construction would work best for this, but should I be using an lapply construction instead?
# Builds a data frame with one row per team holding goals scored, goals
# conceded and games played, appending one row per loop iteration.
# NOTE(review): as posted this function does not parse; see the note above
# the case_when() call.
get_home_away_goals_for_team <- function(matches_df, team_list){
# Empty result frame with the four output columns; rows are appended below.
complete_df <- data.frame(team = character(), goals_scored= double(),goals_conceded = double(), games_played = double())
# NOTE(review): the loop iterates over `teams`, which is neither a parameter
# nor a local of this function; the `team_list` argument is never used.
for (team in teams){
print(team)
# Per-team counters, reset at the start of every iteration.
goals_scored <- 0
goals_conceded <- 0
games_played <- 0
# NOTE(review): every argument below has the form `condition ~ name = value`.
# An assignment after `~` is not a valid function argument, so the call is
# rejected by the parser. case_when() also returns a vector of values; it
# does not run assignments, so it cannot update the counters above.
# The apparent intent is: add hg/ag to scored/conceded when `team` is the
# home side, ag/hg when it is the away side, and count one game either way.
case_when(
matches_df$home == team ~ goals_scored = goals_scored + matches_df$hg,
matches_df$home == team ~ goals_conceded = goals_conceded + matches_df$ag,
matches_df$away == team ~ goals_scored = goals_scored + matches_df$ag,
matches_df$away == team ~ goals_conceded = goals_conceded + matches_df$hg,
matches_df$home == team ~ games_played = games_played + 1,
matches_df$away == team ~ games_played = games_played + 1)
# One-row frame for this team, appended to the accumulated result.
temp_get_goals_df = data.frame(team,goals_scored,goals_conceded,games_played)
complete_df <- rbind(complete_df,temp_get_goals_df)
}
# The accumulated frame is the function's return value.
complete_df
}
The function takes a value of team, checks for each row whether this team was playing in a game home or away, then adds to the values of goals scored accordingly.
When I try to use the function though, I get the error Error: unexpected '}' in " }" which makes me think I'm using case_when incorrectly.
Is this the case ?
Data:
matches_df_example :
structure(list(home = c("Colorado Rapids", "Vancouver Whitecaps",
"DC United", "Los Angeles Galaxy", "San Jose Earthquakes", "FC Dallas"
), away = c("Columbus Crew", "Club de Foot Montreal", "Sporting Kansas City",
"Real Salt Lake", "New England Revolution", "New York Red Bulls"
), res = c("H", "H", "A", "A", "H", "H"), season = c(2012, 2012,
2012, 2012, 2012, 2012), hg = c(2, 2, 0, 1, 1, 2), ag = c(0,
0, 1, 3, 0, 1), date_time = structure(c(1331420400, 1331420400,
1331425800, 1331436600, 1331436600, 1331492400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), home_conference = c("West", "West", "East", "West",
"West", "West"), away_conference = c("East", "East", "East",
"West", "East", "East")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
teams
c("Colorado Rapids", "Vancouver Whitecaps", "DC United", "Los Angeles Galaxy",
"San Jose Earthquakes", "FC Dallas", "Chivas USA", "Portland Timbers",
"Club de Foot Montreal", "Sporting Kansas City", "Real Salt Lake",
"Seattle Sounders", "Philadelphia Union", "Toronto FC", "Columbus Crew",
"New England Revolution", "Chicago Fire", "New York Red Bulls",
"Houston Dynamo", "Orlando City", "New York City", "Atlanta United",
"Minnesota United", "Los Angeles FC", "FC Cincinnati", "Atlanta Utd",
"Nashville SC", "Inter Miami", "Austin FC")
Perhaps this?
# Per-team totals for home matches: the home side scores `hg` and concedes
# `ag`.
homes <- matches_df %>%
  group_by(team = home) %>%
  summarize(
    goals_scored = sum(hg),
    goals_conceded = sum(ag),
    games_played = n(),
    .groups = "drop"
  )

# Per-team totals for away matches: the roles of `hg` and `ag` are swapped,
# because the away side scores `ag` and concedes `hg`.
aways <- matches_df %>%
  group_by(team = away) %>%
  summarize(
    goals_scored = sum(ag),
    goals_conceded = sum(hg),
    games_played = n(),
    .groups = "drop"
  )

# Combine both views and add every team listed in `teams`. A missing side
# counts as 0 and the home and away totals are added together, so a team with
# both home and away matches is credited with all of them.
full_join(homes, aways, by = "team", suffix = c("", ".y")) %>%
  full_join(tibble(team = teams), by = "team") %>%
  transmute(
    team,
    goals_scored = coalesce(goals_scored, 0) +
      coalesce(goals_scored.y, 0),
    goals_conceded = coalesce(goals_conceded, 0) +
      coalesce(goals_conceded.y, 0),
    games_played = coalesce(games_played, 0) +
      coalesce(games_played.y, 0)
  )
# # A tibble: 29 x 4
#    team                   goals_scored goals_conceded games_played
#    <chr>                         <dbl>          <dbl>        <dbl>
#  1 Colorado Rapids                   2              0            1
#  2 DC United                         0              1            1
#  3 FC Dallas                         2              1            1
#  4 Los Angeles Galaxy                1              3            1
#  5 San Jose Earthquakes              1              0            1
#  6 Vancouver Whitecaps               2              0            1
#  7 Club de Foot Montreal             0              2            1
#  8 Columbus Crew                     0              2            1
#  9 New England Revolution            0              1            1
# 10 New York Red Bulls                1              2            1
# # ... with 19 more rows
I have a dataframe that in an entirely simplistic representation looks like this:
structure(list(Plant = c("rose", "rose", "rose", "rose", "rose",
"rose", "rose", "rose", "cactus", "cactus", "cactus", "cactus"
), Area = c("North", "North", "North", "North", "South", "South",
"South", "South", "South", "South", "South", "South"), dups = c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-12L))
For any row of plant, I want to replace it with specific possible combinations of plant and area that are within another data frame. They are not ALL possible combinations, but just the ones that truly exist.
The possible combinations of the variables look like this:
structure(list(nam = c("rose", "rose", "rose", "rose", "cactus",
"cactus"), area = c("North", "South", "East", "West", "South",
"Northwest")), class = "data.frame", row.names = c(NA, -6L))
The final dataset should look like:
structure(list(Plant2 = c("rose", "rose", "rose", "rose", "rose",
"rose", "rose", "rose", "cactus", "cactus", "cactus", "cactus"
), Area2 = c("North", "South", "East", "West", "North", "South",
"East", "West", "South", "Northwest", "South", "Northwest")), class = "data.frame", row.names = c(NA,
-12L))
This is how I started. I created a variable for how many combinations were potentially possible and added them to the dataframe with a join. And then I got super stuck because, try as I might, I can't change the Area variables properly. I thought I could basically paste all the combinations of the variables with the same dups value, but I can't call to the other dataframe from dplyr. This is a very simplistic version of the data; there are many other combinations, so it's not really something I want to do by subsetting the data, etc...
dups<-combos %>% group_by(nam) %>% mutate(dups=n())
colnames(dups)<-c("Plant","Area","dups")
df<-left_join(df,dups)
df<-df %>% uncount(dups, .remove=FALSE)
The information you have provided is not enough to produce a final dataframe like that since each combination of Plant and dups in df can be mapped to multiple values in combos. For instance, each "rose" and "4" could be matched against the first four rows in combos. However, it seems that you simply want Area2 to repeat itself until the values fill up all possible entries for each group of Plant and dups. If so, you can try
library(dplyr)
combos <- combos %>% group_by(nam) %>% mutate(dups = n())
df %>%
group_by(Plant, dups) %>%
mutate(Area2 = rep(
combos$area[combos$nam == Plant[[1L]] & combos$dups == dups[[1L]]],
length.out = n()
))
Output
# A tibble: 12 x 4
# Groups: Plant, dups [2]
Plant Area dups Area2
<chr> <chr> <int> <chr>
1 rose North 4 North
2 rose North 4 South
3 rose North 4 East
4 rose North 4 West
5 rose South 4 North
6 rose South 4 South
7 rose South 4 East
8 rose South 4 West
9 cactus South 2 South
10 cactus South 2 Northwest
11 cactus South 2 South
12 cactus South 2 Northwest
You can use expand.grid to create a dataframe with all possible conditions
# Cross every plant with every area listed in `combos` (the table of existing
# combinations); the first argument varies fastest in the result.
expand.grid(Plant = unique(combos$nam), Area = unique(combos$area))
Plant Area
1 rose North
2 cactus North
3 rose South
4 cactus South
5 rose East
6 cactus East
7 rose West
8 cactus West
9 rose Northwest
10 cactus Northwest
This snippet should do what you want, if I've understood correctly. Here, d1 and d2 are your first and second data frames. I don't think that computing dups as you have is necessary for this task, but maybe I've misunderstood your intention.
library("dplyr")
l <- split(d2$area, d2$nam)
d1 %>%
group_by(Plant) %>%
mutate(Area = rep_len(l[[Plant[1L]]], n())) %>%
ungroup() %>%
select(-dups)
# A tibble: 12 × 2
Plant Area
<chr> <chr>
1 rose North
2 rose South
3 rose East
4 rose West
5 rose North
6 rose South
7 rose East
8 rose West
9 cactus South
10 cactus Northwest
11 cactus South
12 cactus Northwest
It seems to be a very simple flow control structure doubt, however I am having a hard time finding the correct syntax for this in R, I have tried numerous without success. I must be missing something really obvious.
I wanted to loop in a list with Brazilian states codes, and return the region it is in. My aim is to manipulate a larger data set, not a list, but here is a MWE using a list:
a <- c("RO", "AC", "AM" ,"RR", "PA", "AP", "TO", "MA", "PI", "CE", "RN", "PB", "PE", "AL", "SE", "BA", "MG", "ES", "RJ", "SP")
# Maps the state codes in `uf` to region names, collecting the results in a
# list that is returned at the end.
# NOTE(review): as written, the loop and the conditions do not work
# element by element; see the notes below.
setregion <- function(uf) {
# Text progress bar sized to the number of codes.
pb = txtProgressBar(min = 0, max = length(uf), initial = 0)
# Result list; one region name is appended per executed loop iteration.
region_out<-list()
# NOTE(review): `length(uf)` is a single number, so this loop body runs
# exactly once, with i equal to length(uf); it does not visit 1..length(uf).
for (i in length(uf)) {
# NOTE(review): each condition tests the whole vector `uf`, not the i-th
# element `uf[i]`, while `if` expects a single TRUE/FALSE.
if (uf %in% c("RO" ,"AC" ,"AM" ,"RR", "PA" , "AP" , "TO")) {
region_out <- append(region_out,"North")
} else if ( uf %in% c("MA","PI","CE","RN","PB","PE","AL","SE","BA")) {
region_out <-append(region_out,"Northeast")
} else if ( uf %in% c("MG","ES","RJ","SP")){
region_out <- append(region_out,"Southeast")
} else if ( uf %in% c("PR", "SC", "RS")){
region_out <- append(region_out,"South")
} else if ( uf %in% c("MS","MT","GO", "DF")){
region_out <-append(region_out,"Midwest")
}
# Advance the progress bar to the current loop index.
setTxtProgressBar(pb,i)
}
return(region_out)
}
setregion(a)
Upon running the above code, it seems the if loop breaks the for loop as well, and it only returns "North", which is the response to the very first item in the list.
I would expect a list with looking like:
"North", "North", "North" ,"North", "North", "North","North", "Northeast", "Northeast",...
What am I missing?
The problem with normal if-else is that it is not vectorized. You need a vectorized approach, such as the ifelse function. But, in your case, since you have so many conditions, the case_when function from the dplyr library might make more sense:
library(dplyr)
# Map a vector of state codes to region names.
#
# uf: vector of state codes.
# Returns a character vector the same length as `uf`; a code found in none of
# the lists below yields NA.
setregion <- function(uf) {
  case_when(
    uf %in% c("RO", "AC", "AM", "RR", "PA", "AP", "TO") ~ "North",
    uf %in% c("MA", "PI", "CE", "RN", "PB", "PE", "AL", "SE", "BA") ~
      "Northeast",
    uf %in% c("MG", "ES", "RJ", "SP") ~ "Southeast",
    uf %in% c("PR", "SC", "RS") ~ "South",
    uf %in% c("MS", "MT", "GO", "DF") ~ "Midwest"
  )
}
The best approach is to avoid hard-coding this mapping; rather, it's much better to keep it in a file/table and let the code be independent of the mapping (which might change later).
Consider to build a table like that (I might have made mistakes in associating the correct region, but whatever):
ufToRegionMap <- structure(list(uf = c("RO", "AC", "AM", "RR", "PA", "AP", "TO",
"MA", "PI", "CE", "RN", "PB", "PE", "AL", "SE", "BA", "MG", "ES",
"RJ", "SP", "PR", "SC", "RS", "MS", "MT", "GO", "DF"), region = c("North",
"North", "North", "North", "North", "North", "North", "Northeast",
"Northeast", "Northeast", "Northeast", "Northeast", "Northeast",
"Northeast", "Northeast", "Northeast", "Southeast", "Southeast",
"Southeast", "Southeast", "South", "South", "South", "Midwest",
"Midwest", "Midwest", "Midwest")), class = "data.frame", row.names = c(NA,
-27L))
Then, you can define simply your function as such:
# Look up the region of each state code in a mapping table.
#
# uf:            vector of state codes.
# ufToRegionMap: table with a `uf` column (state code) and a `region` column.
# Returns the `region` entries matching `uf`, in the order of `uf`; codes
# missing from the table give NA.
setregion <- function(uf, ufToRegionMap) {
  row_index <- match(uf, ufToRegionMap$uf)
  ufToRegionMap$region[row_index]
}
avoiding all the if-else headache and having a code that is naturally vectorized. Furthermore, if you want to change and create another region/association, you just change the ufToRegionMap with no need of changing the setregion function.
If you don't like case_when() you could use within() and simple conditional assignment in your function.
# Add a `region_out` column to `dat`, derived from state codes.
#
# dat: data frame to annotate.
# a:   vector of state codes, one per row of `dat`. Inside within() a column
#      of `dat` named `a` is found before this argument.
# Returns `dat` with `region_out` set for the codes listed below; rows with
# any other code keep NA (or their previous `region_out` value, if `dat`
# already had that column).
regionizer <- function(dat, a) {
  # `region_out[mask] <- value` modifies an existing object, so the column
  # must exist before within() runs; create it when `dat` does not have it.
  if (is.null(dat[["region_out"]])) {
    dat[["region_out"]] <- rep(NA_character_, nrow(dat))
  }
  within(dat, {
    region_out[a %in% c("RO", "AC", "AM", "RR", "PA", "AP", "TO")] <- "North"
    region_out[a %in% c("MA", "PI", "CE", "RN", "PB", "PE", "AL", "SE", "BA")] <-
      "Northeast"
    region_out[a %in% c("MG", "ES", "RJ", "SP")] <- "Southeast"
    region_out[a %in% c("PR", "SC", "RS")] <- "South"
    region_out[a %in% c("MS", "MT", "GO", "DF")] <- "Midwest"
  })
}
regionizer(dat, a)
# a x region_out
# 1 RO 0.15983063 North
# 2 AC -0.24371961 North
# 3 AM -0.52700098 North
# 4 RR 0.38777302 North
# 5 PA 0.91111258 North
# 6 AP -1.31696659 North
# 7 TO -0.16136374 North
# 8 MA -0.85951191 Northeast
# 9 PI 0.13187218 Northeast
# 10 CE -1.62908394 Northeast
...
Data: dat <- data.frame(a, x=rnorm(length(a)))
Alternatively, this can be solved by merging / joining with a look-up table lut.
a <- c("RO", "AC", "AM" ,"RR", "PA", "AP", "TO", "MA", "PI", "CE", "RN", "PB", "PE", "AL", "SE", "BA", "MG", "ES", "RJ", "SP")
library(data.table)
library(magrittr)
# create look-up table from code snippets supplied by OP
lut <- list(
North = c("RO" ,"AC" ,"AM" ,"RR", "PA" , "AP" , "TO"),
Northeast = c("MA","PI","CE","RN","PB","PE","AL","SE","BA"),
Southeast = c("MG","ES","RJ","SP"),
South = c("PR", "SC", "RS"),
Midwest = c("MS","MT","GO", "DF")
) %>%
lapply(as.data.table) %>%
rbindlist(idcol = "region")
# update join
as.data.table(a)[lut, on = .(a == V1), region_out := region][]
a region_out
1: RO North
2: AC North
3: AM North
4: RR North
5: PA North
6: AP North
7: TO North
8: MA Northeast
9: PI Northeast
10: CE Northeast
11: RN Northeast
12: PB Northeast
13: PE Northeast
14: AL Northeast
15: SE Northeast
16: BA Northeast
17: MG Southeast
18: ES Southeast
19: RJ Southeast
20: SP Southeast
The look-up table was constructed from the code snippets provided by the OP:
region V1
1: North RO
2: North AC
3: North AM
4: North RR
5: North PA
6: North AP
7: North TO
8: Northeast MA
9: Northeast PI
10: Northeast CE
11: Northeast RN
12: Northeast PB
13: Northeast PE
14: Northeast AL
15: Northeast SE
16: Northeast BA
17: Southeast MG
18: Southeast ES
19: Southeast RJ
20: Southeast SP
21: South PR
22: South SC
23: South RS
24: Midwest MS
25: Midwest MT
26: Midwest GO
27: Midwest DF
region V1