I have two rather large data files which I need to merge into one, in the following way:
# Person table: one row per id.
A <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob")
)

# Email table: an id may occur several times, one row per address.
B <- tibble(
  id = c(1, 1, 2),
  email = c("alice#wonderland.com", "alice2#wonderland.com", "bob#builder.com")
)

# Target shape: one row per id, with that id's addresses laid out in
# email1, email2, ... (NA where an id has fewer addresses).
desiredResult <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob"),
  email1 = c("alice#wonderland.com", "bob#builder.com"),
  email2 = c("alice2#wonderland.com", NA)
)
How can this be done efficiently? I tried using spread() but did not succeed and could only hack together a bad solution:
# Work-around: join, split by id, flatten every per-id piece into a single row,
# then glue the differently named columns back together.
notGood <-
inner_join(A, B, by = "id") %>%
split(., .$id) %>%
# Each per-id data frame is flattened with unlist() and turned into a one-row
# tibble; map_dfr() row-binds those rows.
map_dfr(function(x) as.tibble(t(unlist(x)))) %>%
# Blank out the NA cells so that unite() below does not paste "NA" text.
replace(is.na(.), "") %>%
# Merge the numbered and un-numbered variants of each column (id1/id,
# firstName1/firstName, email1/email) into one column.
unite(id, id1, id, sep = "") %>%
unite(firstName, firstName1, firstName, sep = "") %>%
unite(email, email1, email, sep = "") %>%
# Keep the key, the name and every column whose name contains "email".
select(id, firstName, matches("email"))
EDIT:
The suggested solutions work great, but how could I apply them to more than one column? Like in this example:
# Person table: one row per id.
A <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob")
)

# Contact table: an id may occur several times; every row carries one email
# address and one phone number.
B <- tibble(
  id = c(1, 1, 2),
  email = c("alice#wonderland.com", "alice2#wonderland.com", "bob#builder.com"),
  phone = c("123", "456", "789")
)

# Target shape: one row per id, with numbered email* and phone* columns
# (NA where an id has fewer entries).
desiredResult <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob"),
  email1 = c("alice#wonderland.com", "bob#builder.com"),
  email2 = c("alice2#wonderland.com", NA),
  phone1 = c("123", "789"),
  phone2 = c("456", NA)
)
Simply adding more column names to the suggested answers doesn't quite work:
# Attempt at handling two value columns by spreading each of them separately.
A %>%
left_join(B, by='id') %>%
group_by(id)%>%
# Running index within each id, used as the future column names email1, email2, ...
mutate(rn=paste0('email',row_number())) %>%
# The same index again, this time for the phone columns.
mutate(rn2=paste0('phone',row_number())) %>%
# NOTE(review): while email is being spread, phone and rn2 are still ordinary
# columns whose values differ from row to row, so the rows of one id presumably
# cannot collapse into a single row -- the likely reason this does not work.
spread(rn, email) %>%
spread(rn2, phone)
Check this solution:
# Number the addresses within each id, spread them into email1, email2, ...
# columns, then attach the names from A.
B %>%
  group_by(id) %>%
  mutate(rn = paste0("email", row_number())) %>%
  # Drop the grouping so the final result is a plain, ungrouped tibble.
  ungroup() %>%
  spread(rn, email) %>%
  # Name the key explicitly instead of relying on an implicit natural join.
  right_join(A, by = "id") %>%
  select(id, firstName, everything())
Answer to added question:
# Works for any number of value columns in B: go long first, number the values
# per (id, original column), then spread once and join the result onto A.
A %>%
  left_join(
    B %>%
      gather(key, val, -id) %>%
      group_by(id, key) %>%
      # email -> email1, email2, ...; phone -> phone1, phone2, ...
      mutate(key2 = paste0(key, row_number())) %>%
      ungroup() %>%
      select(-key) %>%
      spread(key2, val),
    # Name the key explicitly instead of relying on an implicit natural join.
    by = "id"
  )
# Reshape B so that every id has its addresses in email1, email2, ..., then keep
# the ids that occur in both tables.
desiredResult <- inner_join(
  A,
  B %>%
    group_by(id) %>%
    mutate(ColName = paste0("email", row_number())) %>%
    ungroup() %>%
    spread(ColName, email),
  by = "id"
)
Related
I have a data.frame with the following structure:
What I need is that whenever a value in the first column occurs more than once, all corresponding entries in column V18 are combined into one cell.
I applied the following code.
# Concatenate the elements of `v` from left to right without a separator.
# Returns NULL for an empty input and the single element for a length-one input.
p <- function(v) {
  if (length(v) == 0L) {
    return(NULL)
  }
  joined <- v[[1L]]
  for (i in seq_along(v)[-1L]) {
    joined <- paste0(joined, v[[i]])
  }
  joined
}
# Collapse V18 within each V1 group using p(), then merge with M_TEST on V1 and
# keep the key, V18 and the collapsed text.
Data %>%
  group_by(V1) %>%
  summarise(test = p(as.character(V18))) %>%
  merge(x = ., y = M_TEST, by = "V1") %>%
  select(V1, V18, test)
It gives:
What I need is that instead of 4344, it is {43,44}.
How can I do this?
Thank you very much for your help!
Sincerely
Try This:
# Build the "{43,44}" label directly: drop missing values, join the remaining
# ones with commas and wrap the result in braces. Unlike inserting a separator
# after every second digit, this works for values of any length.
Data %>%
  group_by(V1) %>%
  summarise(
    test = paste0("{", paste(V18[!is.na(V18)], collapse = ","), "}")
  ) %>%
  merge(M_TEST, by = "V1") %>%
  select(V1, V18, test)
Edit: For Multiple Columns, this should work:
# Same idea for a range of columns: within each V1 group, join the non-missing
# values of every column V2..V18 with commas (values may have any length).
Data %>%
  group_by(V1) %>%
  summarise_at(vars(V2:V18), ~ paste(.[!is.na(.)], collapse = ","))
Does anybody know why I am getting this error here? If so, how can I resolve this issue?
This code is working fine on my sample dataset but if I apply this on my whole dataset, I'm getting the above error.
library(dplyr)
library(tidyr)
library(stringr)
library(bindrcpp)
df2 <- read.csv("filepath.csv", header = TRUE, sep = ",")

df2 %>%
  # Strip everything up to and including "Agent", one whitespace and a word.
  mutate(Body_text = sub("^.*Agent\\s\\w+", "", Body_text)) %>%
  # One row per "<speaker>: <text>" chunk, then split speaker from text.
  separate_rows(Body_text, sep = "\\s(?=\\w+:)") %>%
  separate(Body_text, into = c("Text_from", "value"), sep = ":\\s?") %>%
  na.omit() %>%
  # Every speaker other than "Customer" is treated as the agent.
  group_by(
    Id,
    newgrp = str_c(
      "Text_from_",
      replace(Text_from, Text_from != "Customer", "Agent")
    )
  ) %>%
  mutate(rn = row_number()) %>%
  pivot_wider(names_from = newgrp, values_from = value) %>%
  group_by(Id) %>%
  # One row per Id: first speaker label plus the space-joined texts.
  summarise(
    AgentName = first(Text_from),
    Text_from_Agent = str_c(na.omit(Text_from_Agent), collapse = " "),
    Text_from_Customer = str_c(na.omit(Text_from_Customer), collapse = " ")
  )
The following image is my sample dataset.
Can anyone fix this issue? Any help is appreciated.
Thanks!
We can add a condition that checks whether all the elements of a column are NA for a particular group: if they are, return NA; otherwise paste the non-NA elements together with str_c
library(stringr)
library(dplyr)
library(tidyr)
df2 %>%
  # Strip everything up to and including "Agent", one whitespace and a word.
  mutate(Body_text = sub("^.*Agent\\s\\w+", "", Body_text)) %>%
  # One row per "<speaker>: <text>" chunk, then split speaker from text.
  separate_rows(Body_text, sep = "\\s(?=\\w+:)") %>%
  separate(Body_text, into = c("Text_from", "value"), sep = ":\\s?") %>%
  na.omit() %>%
  # Every speaker other than "Customer" is treated as the agent.
  group_by(
    Id,
    newgrp = str_c(
      "Text_from_",
      replace(Text_from, Text_from != "Customer", "Agent")
    )
  ) %>%
  mutate(rn = row_number()) %>%
  pivot_wider(names_from = newgrp, values_from = value) %>%
  group_by(Id) %>%
  summarise(
    AgentName = first(Text_from),
    # If a group has no text at all for a side, return NA; otherwise join the
    # non-NA pieces with single spaces.
    Text_from_Agent = if (all(is.na(Text_from_Agent))) {
      NA_character_
    } else {
      str_c(Text_from_Agent[!is.na(Text_from_Agent)], collapse = " ")
    },
    Text_from_Customer = if (all(is.na(Text_from_Customer))) {
      NA_character_
    } else {
      str_c(Text_from_Customer[!is.na(Text_from_Customer)], collapse = " ")
    }
  )
library(tidyverse)
library(stringr)
library(janitor)
# Count how often each word occurs in one text column of `data`.
#   data     - a data frame / tibble
#   char_col - the text column, passed unquoted and captured with enquo()
# Returns one row per lower-cased word with its frequency, most frequent first.
word_count <- function(data, char_col) {
char_col <- enquo(char_col)
data %>%
select(!!char_col) %>%
# NOTE(review): the left-hand side `char_col` is taken literally here, so both
# steps write to a column called "char_col" while the selected column stays in
# the data unchanged. The second step also reads the original column again, not
# the punctuation-free result of the first step.
mutate(char_col = str_remove_all(!!char_col, '[[:punct:]]')) %>%
mutate(char_col = str_split(!!char_col, ' ')) %>%
# Spread the pieces over up to 30 columns; missing pieces are filled with NA on
# the right.
separate(char_col, into = paste0('col', 1:30), fill = 'right') %>%
select(-col1) %>%
# Stack every remaining column into a single `word` column.
gather(value = word) %>%
select(word) %>%
remove_empty(c('rows')) %>%
filter(word != '') %>%
mutate(word = str_to_lower(word)) %>%
group_by(word) %>%
summarize(freq = n()) %>%
arrange(desc(freq))
}
# Example call: append " species" to every label, then count the words.
iris %>%
  as_tibble() %>%
  mutate(Species = str_c(Species, " species")) %>%
  word_count(Species)
This code works as intended outside of a function, but when I use it inside of a function it will return the frequencies of each word and of each 'non-split' string.
I assume this is a problem with how I'm placing the '!!' operators, but I'm not able to solve this by trial and error placement with them. It may also be a lazyeval issue, which I'm not certain how to solve.
I want the output of the function to match the output of the code below.
# Reference pipeline written out without a function: word frequencies of the
# Species labels after " species" has been appended.
iris %>%
  as_tibble() %>%
  mutate(Species = str_c(Species, " species")) %>%
  select(Species) %>%
  mutate(Species = str_remove_all(Species, "[[:punct:]]")) %>%
  mutate(Species = str_split(Species, " ")) %>%
  # Spread the pieces over up to 30 columns (NA-filled on the right); the first
  # piece is discarded by the next step.
  separate(Species, into = paste0("col", 1:30), fill = "right") %>%
  select(-col1) %>%
  # Stack every remaining column into a single `word` column.
  gather(value = word) %>%
  select(word) %>%
  remove_empty(c("rows")) %>%
  filter(word != "") %>%
  mutate(word = str_to_lower(word)) %>%
  group_by(word) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq))
We cannot use the char_col object directly as the column name on the left-hand side of mutate(); it needs to be evaluated first. char_col is a quosure, which can be converted to a string (quo_name(char_col)) or to a symbol; unquoting that with !! and assigning with := gives the correct column name
# Count how often each word occurs in one text column of `data`.
#   data     - a data frame / tibble
#   char_col - the text column, passed unquoted
# Returns one row per lower-cased word with its frequency, most frequent first.
word_count <- function(data, char_col) {
  char_col <- enquo(char_col)
  # Column name as a string; with `!!` and `:=` it overwrites that column in
  # place instead of creating a new one.
  char_colC <- quo_name(char_col)

  data %>%
    select(!!char_col) %>%
    mutate(!!char_colC := str_remove_all(!!char_col, "[[:punct:]]")) %>%
    mutate(!!char_colC := str_split(!!char_col, " ")) %>%
    # Unquote the column here too, rather than handing over the bare name of a
    # local variable and relying on it being looked up outside the data.
    separate(!!char_col, into = paste0("col", 1:30), fill = "right") %>%
    select(-col1) %>%
    # Stack every remaining column into a single `word` column.
    gather(value = word) %>%
    select(word) %>%
    remove_empty(c("rows")) %>%
    filter(word != "") %>%
    mutate(word = str_to_lower(word)) %>%
    group_by(word) %>%
    summarize(freq = n()) %>%
    arrange(desc(freq))
}
# Result obtained through the function.
out2 <- iris %>%
  as_tibble() %>%
  mutate(Species = str_c(Species, " species")) %>%
  word_count(Species)
-checking the output without using the function
# Result obtained with the same steps written out inline.
out1 <- iris %>%
  as_tibble() %>%
  mutate(Species = str_c(Species, " species")) %>%
  select(Species) %>%
  mutate(Species = str_remove_all(Species, "[[:punct:]]")) %>%
  mutate(Species = str_split(Species, " ")) %>%
  separate(Species, into = paste0("col", 1:30), fill = "right") %>%
  select(-col1) %>%
  gather(value = word) %>%
  select(word) %>%
  remove_empty(c("rows")) %>%
  filter(word != "") %>%
  mutate(word = str_to_lower(word)) %>%
  group_by(word) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq))

# Both routes must give exactly the same table.
identical(out1, out2)
#[1] TRUE
I have generated this summary table based on the df below.
# Reproducible toy data: 10 rows with a year, a group label and three
# uniform(0, 1) measurements. The columns are drawn in the order listed, so the
# random stream is consumed as Year, Group, V1, V2, V3.
set.seed(1)
df <- data.frame(
  Year = sample(c(2012, 2016), 10, replace = TRUE),
  Group = sample(c("Treat", "Control"), 10, replace = TRUE),
  V1 = runif(10, 0, 1),
  V2 = runif(10, 0, 1),
  V3 = runif(10, 0, 1)
)
# Per (Year, Group): group size N plus sd and median of every measurement.
summary.table <- df %>%
  group_by(Year, Group) %>%
  # Carry the group size along as an extra grouping key so it is kept in the
  # summary (avoids the deprecated `add =` argument of group_by()).
  mutate(N = n()) %>%
  group_by(Year, Group, N) %>%
  # Named list instead of the deprecated funs(); yields V1_sd, ..., V3_median.
  summarise_all(list(sd = sd, median = median)) %>%
  ungroup() %>%
  # Show each year only once: blank it out on repeated rows.
  mutate(Year = ifelse(duplicated(Year), "", Year))
Is there a way I could display the values related to the median columns as percentages?
I did not know how to use mutate() and scales::percent() for only a subset of columns (I don't want to do it column by column, since there will be more columns in the original dataset, which would make that approach impractical).
What should I have done instead if I wanted to mutate according to a subset of rows?
Thank you
EDIT:
And if it was like this?
# Variant with one row per (Year, variable) and one column per
# statistic/group combination.
summary.table <- df %>%
  group_by(Year, Group) %>%
  # Named list instead of the deprecated funs(); yields V1_median, ..., V3_sd.
  summarise_all(list(median = median, sd = sd)) %>%
  # Long format: one row per (Year, Group, "<var>_<stat>") value.
  gather(key, value, -Year, -Group) %>%
  separate(key, into = c("var", "stat")) %>%
  # Combine statistic and group into the future column name.
  unite(stat_Group, stat, Group) %>%
  spread(stat_Group, value) %>%
  ungroup() %>%
  # Show each year only once: blank it out on repeated rows.
  mutate(Year = ifelse(duplicated(Year), "", Year))
We need to use the percent wrapped on median
# Per (Year, Group): group size N, the sd of every measurement, and the median
# of every measurement formatted as a percentage string.
summary.table <- df %>%
  group_by(Year, Group) %>%
  # Carry the group size along as an extra grouping key so it is kept in the
  # summary (avoids the deprecated `add =` argument of group_by()).
  mutate(N = n()) %>%
  group_by(Year, Group, N) %>%
  # Named list instead of the deprecated funs(); only the median is wrapped in
  # scales::percent().
  summarise_all(list(sd = sd, median = ~ scales::percent(median(.)))) %>%
  ungroup() %>%
  # Show each year only once: blank it out on repeated rows.
  mutate(Year = ifelse(duplicated(Year), "", Year))
I am learning to get, clean and combine data. I am confused about why, in a loop, the rbind command returns only 10 values instead of the expected 30 that I get when I combine the pages manually (one i at a time).
library(XML)
# Accumulator for the values collected from each listing page.
mergeal <- NULL
# NOTE(review): this is the single number 3, not the page numbers 1..3, so
# length(tabnums) is 1 and the loop below runs exactly once (for page 3).
tabnums <- 3
for (i in 1:length(tabnums)) {
# Build the URL of the i-th requested page and read every HTML table on it.
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
# NOTE(review): ff4 is a plain vector of 10 values, so rbind() adds it as one
# row of `mergeal` rather than appending its elements.
mergeal <- rbind(mergeal, ff4)
}
I've tried using a list (rbind on a list of data frames with one column of characters and numerics) but still get the same result as above. I would appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
# Scrape listing pages 1 to 3 and reshape the text of each listing into one row
# with one column per field.
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
# Per page number: build the URL, read every HTML table on the page and keep
# the 8th one.
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
# Only the first column is used; drop repeated cells.
select(V1) %>%
distinct %>%
# Put a line break in place of "Â" and in front of the "Type:", "Time:" and
# "Area:" labels, then split every cell at the line breaks.
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
# One row per line; trim it, and drop the "There are currently ..." lines and
# the empty ones.
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
# Split "label: value"; with fill = "left" a line without ":" gets an NA label.
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
# Unlabelled lines become "Description"; ID goes up by one at each of them, so
# the lines that follow a description share its ID.
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
# One row per ID with one column per label.
spread(variable, value) %>%
# Convert Area and Price to numbers and combine Date and Time (with "a.m." /
# "p.m." rewritten as "am" / "pm") into one parsed date-time.
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)