Merging and transforming two sets of data

Merging and transforming two sets of data - r

I have two rather large data files which I need to merge into one, in the following way:
# Example inputs: A has one row per person, B has one row per e-mail address,
# so an id can occur several times in B (id 1 has two addresses).
A <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob")
)
B <- tibble(
  id = c(1, 1, 2),
  # First address corrected so it matches the value expected in desiredResult.
  email = c("alice#wonderland.com", "alice2#wonderland.com", "bob#builder.com")
)
# Target shape: one row per id, with the n-th address of each id in column
# email<n>; ids with fewer addresses are padded with NA.
desiredResult <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob"),
  email1 = c("alice#wonderland.com", "bob#builder.com"),
  email2 = c("alice2#wonderland.com", NA)
)
How can this be done efficiently? I tried using spread() but did not succeed and could only hack together a bad solution:
# Workaround from the question: join, split per id, flatten every group into a
# single row, then glue the differently named columns back together.
notGood <-
inner_join(A, B, by = "id") %>%
split(., .$id) %>%
# unlist() flattens each per-id data frame into one named vector and t() turns
# it into a one-row matrix. NOTE(review): groups with several rows presumably
# get numbered names (id1, email1, ...) while single-row groups keep the plain
# names, which is why both spellings are united below - confirm.
map_dfr(function(x) as.tibble(t(unlist(x)))) %>%
# Turn the NAs created by row-binding into empty strings so unite() can paste.
replace(is.na(.), "") %>%
# Merge each numbered column with its plain counterpart.
unite(id, id1, id, sep = "") %>%
unite(firstName, firstName1, firstName, sep = "") %>%
unite(email, email1, email, sep = "") %>%
# Keep id, firstName and every column whose name contains "email".
select(id, firstName, matches("email"))
EDIT:
The suggested solutions work great, but how could I apply them to more than one column? Like in this example:
# Extended example: B now carries two value columns (email and phone) per row.
A <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob")
)
B <- tibble(
  id = c(1, 1, 2),
  # First address corrected so it matches the value expected in desiredResult.
  email = c("alice#wonderland.com", "alice2#wonderland.com", "bob#builder.com"),
  phone = c("123", "456", "789")
)
# Target shape: one row per id, with the n-th email/phone of each id in columns
# email<n> / phone<n>; missing entries are NA.
desiredResult <- tibble(
  id = 1:2,
  firstName = c("Alice", "Bob"),
  email1 = c("alice#wonderland.com", "bob#builder.com"),
  email2 = c("alice2#wonderland.com", NA),
  phone1 = c("123", "789"),
  phone2 = c("456", NA)
)
Simply adding more column names to the suggested answers doesn't quite work:
# Attempt from the question to extend the single-column answer to two columns.
A %>%
left_join(B, by='id') %>%
group_by(id)%>%
# Build one column-name key per value column (email1, email2, ... / phone1, ...).
mutate(rn=paste0('email',row_number())) %>%
mutate(rn2=paste0('phone',row_number())) %>%
# NOTE(review): the first spread() runs while `phone` and `rn2` are still
# present, so rows of the same id presumably stay separate instead of
# collapsing into one - this looks like the reason the result is not the
# desired one; confirm.
spread(rn, email) %>%
spread(rn2, phone)

Check this solution:
# Number the e-mails within each id, spread those numbers into the columns
# email1, email2, ..., then attach the first names from A.
B %>%
  group_by(id) %>%
  mutate(rn = paste0('email', row_number())) %>%
  # Drop the grouping so the result is a plain tibble rather than a grouped one.
  ungroup() %>%
  spread(rn, email) %>%
  # Name the join key explicitly instead of relying on the common-column guess.
  right_join(A, by = "id") %>%
  # Put the identifying columns first, followed by all email<n> columns.
  select(id, firstName, everything())
Answer to added question:
# Multi-column variant: stack every value column of B into key/value pairs,
# number the values within each (id, original column) pair, and spread them
# back out as email1, email2, phone1, phone2, ...
A %>%
  left_join(
    B %>%
      gather(key, val, -id) %>%
      group_by(id, key) %>%
      mutate(key2 = paste0(key, row_number())) %>%
      ungroup() %>%
      # `key` must go, otherwise spread() would keep one row per original column.
      select(-key) %>%
      spread(key2, val),
    # Name the join key explicitly instead of relying on the common-column guess.
    by = "id"
  )

# Widen B to one row per id (columns email1, email2, ...) and keep only the ids
# that are present in both A and the widened table.
desiredResult <- inner_join(
  A,
  B %>%
    group_by(id) %>%
    mutate(ColName = paste0("email", row_number())) %>%
    ungroup() %>%
    spread(ColName, email),
  by = "id"
)

Related

Formatting of Data Frames in R

I have a data.frame with the following structure:
What I need is that, whenever a value in the first column occurs more than once, all corresponding entries in column V18 are combined into one cell.
I applied the following code.
# Concatenate all elements of `v` into one string, left to right, with no
# separator between them.
p <- function(v) {
  join_pair <- function(so_far, nxt) paste0(so_far, nxt)
  Reduce(join_pair, v)
}
# For every V1 group, concatenate its V18 values into one string with p(),
# then join that summary onto M_TEST and keep the three columns of interest.
Data %>%
group_by(V1) %>%
summarise(test = p(as.character(V18))) %>%
merge(., M_TEST, by = 'V1') %>%
select(V1, V18, test)
It gives:
What I need is that instead of 4344, it is {43,44}.
How can I do this?
Thank you very much for your help!
Sincerely

Try This:
# Same pipeline as in the question, followed by string clean-up of `test`.
Data %>%
group_by(V1) %>%
summarise(test = p(as.character(V18))) %>%
merge(., M_TEST, by = 'V1') %>%
select(V1, V18, test) %>%
# Strip the literal text "NA" out of the concatenated string.
mutate(test = str_remove_all(test, pattern = "NA")) %>%
# Re-insert a comma every two digits by formatting the string as a number.
# NOTE(review): this goes through as.numeric(), so it presumably only works
# when every original value is a two-digit number - confirm against the data.
mutate(test = formatC(as.numeric(test), big.mark=",", big.interval = 2L)) %>%
# Wrap the result in braces, e.g. {43,44}.
mutate(test = paste0("{", test, "}"))
Edit: For Multiple Columns, this should work:
# Multi-column variant: apply the same concatenate / clean / re-format steps to
# every column from V2 through V18.
Data %>%
group_by(V1) %>%
# Paste the values of each group together without a separator.
summarise_at(vars(V2:V18), paste0, collapse="") %>%
# Strip the literal text "NA" out of the concatenated strings.
mutate_at(vars(V2:V18), str_remove_all, pattern = "NA") %>%
mutate_at(vars(V2:V18), as.numeric) %>%
# Insert a comma every two digits. NOTE(review): presumably assumes purely
# numeric two-digit values in all of these columns - confirm.
mutate_at(vars(V2:V18), formatC, big.mark=",", big.interval = 2L)

Error in summarise_impl(.data, dots) : Column Text_from_Customer must be length 1 (a summary value), not 0

Does anybody know why I am getting this error here? If so, how can I resolve it?
This code is working fine on my sample dataset but if I apply this on my whole dataset, I'm getting the above error.
library(dplyr)
library(tidyr)
library(stringr)
library(bindrcpp)
# Read the raw data. NOTE(review): "filepath.csv" looks like a placeholder path.
df2<-read.csv("filepath.csv", header=TRUE, sep=",")
df2 %>%
# Remove the leading part of Body_text up to and including "Agent <word>".
mutate(Body_text = sub("^.*Agent\\s\\w+", "", Body_text)) %>%
# One row per segment: split at whitespace that is followed by "<word>:".
separate_rows(Body_text, sep="\\s(?=\\w+:)") %>%
# Split each segment on ":" into the speaker label and the spoken text.
separate(Body_text, into = c("Text_from", "value"), sep=":\\s?") %>%
na.omit %>%
# Group by Id and a derived label: "Text_from_Customer" for the customer,
# "Text_from_Agent" for every other speaker.
group_by(Id, newgrp = str_c('Text_from_',
replace(Text_from, Text_from != "Customer", "Agent"))) %>%
# Row counter within each group, so pivot_wider() keeps the rows apart.
mutate(rn = row_number()) %>%
pivot_wider(names_from = newgrp, values_from = value) %>%
group_by(Id) %>%
# Collapse every Id to a single row.
# NOTE(review): going by the reported error, str_c() returns a zero-length
# value when na.omit() leaves nothing for an Id, which summarise() rejects -
# confirm with the stringr version in use.
summarise(AgentName = first(Text_from),
Text_from_Agent = str_c(na.omit(Text_from_Agent), collapse=' '),
Text_from_Customer = str_c(na.omit(Text_from_Customer), collapse = ' '))
The following image is my sample dataset.
Can anyone fix this issue? Any help is appreciated.
Thanks!

We can add a condition that checks whether all the elements of a column are NA for a particular group; if so, return NA, otherwise paste the non-NA elements together with str_c
library(stringr)
library(dplyr)
library(tidyr)
# Build one row per Id holding the agent name plus the concatenated agent and
# customer texts. An Id without any text on one side gets NA_character_ there,
# so every summary value has length 1.
df2 %>%
  # Remove the leading part of Body_text up to and including "Agent <word>".
  mutate(Body_text = sub("^.*Agent\\s\\w+", "", Body_text)) %>%
  # One row per segment: split at whitespace that is followed by "<word>:".
  separate_rows(Body_text, sep = "\\s(?=\\w+:)") %>%
  # Split each segment on ":" into the speaker label and the spoken text.
  separate(Body_text, into = c("Text_from", "value"), sep = ":\\s?") %>%
  na.omit %>%
  # "Text_from_Customer" for the customer, "Text_from_Agent" for anyone else.
  group_by(Id, newgrp = str_c('Text_from_',
                              replace(Text_from, Text_from != "Customer", "Agent"))) %>%
  mutate(rn = row_number()) %>%
  pivot_wider(names_from = newgrp, values_from = value) %>%
  group_by(Id) %>%
  summarise(
    AgentName = first(Text_from),
    # `collapse` must be an argument of str_c(); the guard returns a typed NA
    # when there is nothing to paste for this Id.
    Text_from_Agent = if (all(is.na(Text_from_Agent))) {
      NA_character_
    } else {
      str_c(Text_from_Agent[!is.na(Text_from_Agent)], collapse = ' ')
    },
    Text_from_Customer = if (all(is.na(Text_from_Customer))) {
      NA_character_
    } else {
      str_c(Text_from_Customer[!is.na(Text_from_Customer)], collapse = ' ')
    }
  )

Writing pipeable function in dplyr syntax, using enquo(), not returning intended output

library(tidyverse)
library(stringr)
library(janitor)
# Word-frequency helper as posted in the question (does not yet behave as
# intended).
#   data:     data frame to read from.
#   char_col: unquoted column to tokenise; captured with enquo().
word_count <- function(data, char_col) {
char_col <- enquo(char_col)
data %>%
select(!!char_col) %>%
# NOTE: `char_col =` on the left-hand side is taken literally, so these two
# mutate() calls write a new column named "char_col" and leave the column the
# caller passed in untouched.
mutate(char_col = str_remove_all(!!char_col, '[[:punct:]]')) %>%
mutate(char_col = str_split(!!char_col, ' ')) %>%
separate(char_col, into = paste0('col', 1:30), fill = 'right') %>%
select(-col1) %>%
# Stack all remaining columns into one `word` column.
gather(value = word) %>%
select(word) %>%
remove_empty(c('rows')) %>%
filter(word != '') %>%
mutate(word = str_to_lower(word)) %>%
# Count occurrences per word, most frequent first.
group_by(word) %>%
summarize(freq = n()) %>%
arrange(desc(freq))
}
# Example call: append " species" to every Species value, then count words.
iris %>%
as.tibble() %>%
mutate(Species = str_c(Species, ' species')) %>%
word_count(Species)
This code works as intended outside of a function, but when I use it inside of a function it will return the frequencies of each word and of each 'non-split' string.
I assume this is a problem with how I'm placing the '!!' operators, but I'm not able to solve this by trial and error placement with them. It may also be a lazyeval issue, which I'm not certain how to solve.
I want the output of the function to match the output of the code below.
# Reference pipeline written without a function; its output is what the
# function version is supposed to reproduce.
iris %>%
as.tibble() %>%
mutate(Species = str_c(Species, ' species')) %>%
select(Species) %>%
# Remove punctuation, then split each value on single spaces.
mutate(Species = str_remove_all(Species, '[[:punct:]]')) %>%
mutate(Species = str_split(Species, ' ')) %>%
# Spread the pieces over up to 30 columns, padding on the right with NA.
separate(Species, into = paste0('col', 1:30), fill = 'right') %>%
select(-col1) %>%
# Stack all remaining columns into one `word` column.
gather(value = word) %>%
select(word) %>%
remove_empty(c('rows')) %>%
filter(word != '') %>%
mutate(word = str_to_lower(word)) %>%
# Count occurrences per word, most frequent first.
group_by(word) %>%
summarize(freq = n()) %>%
arrange(desc(freq))

We cannot use the char_col object directly as a column name in mutate; it needs to be evaluated. char_col is a quosure, which can be converted to a character string (quo_name(char_col)) or to a symbol that, when evaluated (!!), lets := assign to the correct column name
# Count how often each word occurs in one character column.
#
#   data:     a data frame or tibble.
#   char_col: unquoted name of the character column to tokenise.
#
# Returns a tibble with the columns `word` (lower-cased) and `freq`, sorted by
# descending frequency.
word_count <- function(data, char_col) {
  # Capture the column the caller named, plus its name as a string so it can
  # be used on the left-hand side of `:=`.
  char_col <- enquo(char_col)
  char_colC <- quo_name(char_col)
  data %>%
    select(!!char_col) %>%
    # Overwrite the caller's column in place: strip punctuation, then split
    # each value on single spaces.
    mutate(!!char_colC := str_remove_all(!!char_col, '[[:punct:]]')) %>%
    mutate(!!char_colC := str_split(!!char_col, ' ')) %>%
    # Unquote the captured column instead of passing the local variable name;
    # a bare `char_colC` only works through the ambiguous external-variable
    # fallback of column selection.
    separate(!!char_col, into = paste0('col', 1:30), fill = 'right') %>%
    # NOTE(review): the first piece is discarded, presumably because it is a
    # junk token produced by separating the list column - confirm.
    select(-col1) %>%
    # Stack all remaining columns into one `word` column.
    gather(value = word) %>%
    select(word) %>%
    remove_empty(c('rows')) %>%
    filter(word != '') %>%
    mutate(word = str_to_lower(word)) %>%
    group_by(word) %>%
    summarize(freq = n()) %>%
    arrange(desc(freq))
}
# Result obtained through the function.
out2 <- iris %>%
  # as_tibble() replaces the deprecated as.tibble().
  as_tibble() %>%
  mutate(Species = str_c(Species, ' species')) %>%
  word_count(Species)
-checking the output without using the function
# Reference result: the same steps written inline, without the function.
out1 <- iris %>%
  # as_tibble() replaces the deprecated as.tibble().
  as_tibble() %>%
  mutate(Species = str_c(Species, ' species')) %>%
  select(Species) %>%
  mutate(Species = str_remove_all(Species, '[[:punct:]]')) %>%
  mutate(Species = str_split(Species, ' ')) %>%
  separate(Species, into = paste0('col', 1:30), fill = 'right') %>%
  select(-col1) %>%
  gather(value = word) %>%
  select(word) %>%
  remove_empty(c('rows')) %>%
  filter(word != '') %>%
  mutate(word = str_to_lower(word)) %>%
  group_by(word) %>%
  summarize(freq = n()) %>%
  arrange(desc(freq))
# Both approaches must give exactly the same table.
identical(out1, out2)
#[1] TRUE

Mutating values of subset of columns into percentage format

I have generated this summary table based on the df below.
# Reproducible toy data: 10 rows with a random year, a random group label and
# three uniform(0, 1) measurement columns. The columns are drawn in the order
# Year, Group, V1, V2, V3.
set.seed(1)
df <- setNames(
  data.frame(
    sample(c(2012, 2016), size = 10, replace = TRUE),
    sample(c("Treat", "Control"), size = 10, replace = TRUE),
    runif(10, min = 0, max = 1),
    runif(10, min = 0, max = 1),
    runif(10, min = 0, max = 1)
  ),
  c("Year", "Group", "V1", "V2", "V3")
)
# Per Year/Group summary: sd and median of every remaining column, with the
# group size N carried along as an additional grouping column.
summary.table = df %>%
group_by(Year, Group) %>%
group_by(N = n(), add = TRUE) %>%
summarise_all(funs(sd,median)) %>%
ungroup %>%
# Blank out repeated Year values so each year is printed only once.
mutate(Year = ifelse(duplicated(Year),"",Year))
Is there a way I could display the values related to the median columns as percentages?
I did not know how to use mutate() and scales::percent() on only a subset of columns (I don't want to do it column by column, since the original dataset has more columns, which would make that approach impractical).
What should I have done instead if I wanted to mutate according to a subset of rows?
Thank you
EDIT:
And if it was like this?
# Variant from the question: compute median and sd per Year/Group, then reshape
# so that every statistic/group combination becomes its own column.
summary.table = df %>%
group_by(Year, Group) %>%
summarise_all(funs(median,sd)) %>%
# Long format: one row per (Year, Group, summarised column).
gather(key, value, -Year, -Group) %>%
# Split the summarised column name into the variable and the statistic.
separate(key, into=c("var", "stat")) %>%
# Combine statistic and group into one label and spread it into columns.
unite(stat_Group, stat, Group) %>%
spread(stat_Group, value) %>%
ungroup %>%
# Blank out repeated Year values so each year is printed only once.
mutate(Year = ifelse(duplicated(Year),"",Year))

We need to use the percent wrapped on median
# Per Year/Group summary in which only the median columns are formatted as
# percentages; the sd columns stay numeric.
summary.table <- df %>%
  group_by(Year, Group) %>%
  # Carry the group size along as an additional grouping column.
  # NOTE: newer dplyr releases spell this argument `.add`.
  group_by(N = n(), add = TRUE) %>%
  # A named list of lambdas replaces the deprecated funs(); the list names
  # become the column suffixes (V1_sd, V1_median, ...).
  summarise_all(list(
    sd = ~ sd(.),
    median = ~ scales::percent(median(.))
  )) %>%
  ungroup %>%
  # Blank out repeated Year values so each year is printed only once.
  mutate(Year = ifelse(duplicated(Year), "", Year))

Combine list of data frames with one column of characters

I am learning to fetch, clean and combine data. I am confused about why calling rbind in a loop returns 10 rows instead of the expected 30 that I get when I combine the pages manually (one i at a time).
library(XML)
# Accumulator for the rows collected from every page.
mergeal <- NULL
# NOTE(review): tabnums holds the single value 3, so length(tabnums) is 1 and
# the loop below runs exactly once (for page 3) - likely the reason only 10
# entries come back instead of 30; confirm the intended pages (e.g. 1:3).
tabnums <- 3
for (i in 1:length(tabnums)) {
# Build the listing URL for the current page number.
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
# Read every HTML table found on that page.
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
# ff4 is a plain vector here, so rbind() adds it as one new row.
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.

I cleaned up the data because I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
# Download listing pages 1 to 3, take the 8th HTML table of each, and reshape
# the free-text cells into one row per listing with typed columns.
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
# Per page: build the URL, read all tables, keep the 8th one.
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
# Only the first column is used; drop duplicated cells.
select(V1) %>%
distinct %>%
# Turn each cell into a list of lines: the "Â" character becomes a line break
# and a line break is inserted in front of the Type:/Time:/Area: labels.
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
# One row per line, trimmed; drop the "There are currently ..." lines and
# empty lines.
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
# Split "label: value"; lines without a ":" get an NA label (fill = "left").
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
# Unlabelled lines are treated as the description; every description line
# starts a new listing ID (running count of descriptions seen so far).
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
# One column per label, one row per listing ID.
spread(variable, value) %>%
# Convert Area and Price to numbers and combine Date and Time into a single
# date-time parsed as day-month-year hour:minute.
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Merging and transforming two sets of data - r

# Widen B to one row per id (columns email1, email2, ...) and keep only the ids
# that are present in both A and the widened table.
desiredResult <- inner_join(
  A,
  B %>%
    group_by(id) %>%
    mutate(ColName = paste0("email", row_number())) %>%
    ungroup() %>%
    spread(ColName, email),
  by = "id"
)

Related

Formatting of Data Frames in R

Error in summarise_impl(.data, dots) : Column Text_from_Customer must be length 1 (a summary value), not 0

Writing pipeable function in dplyr syntax, using enquo(), not returning intended output

Mutating values of subset of columns into percentage format

Combine list of data frames with one column of characters

Categories

Resources