Formatting of Data Frames in R - r

I have a data.frame with the following structure:
What I need is that in case that a value in the first column occures more than once, all corresponding entries in column V18 are concluded in one cell.
I applied the folling code.
p <- function(v) {
Reduce(f=paste0, x = v)
}
Data %>%
group_by(V1) %>%
summarise(test = p(as.character(V18))) %>%
merge(., M_TEST, by = 'V1') %>%
select(V1, V18, test)
It gives:
What I need is that instead of 4344, it is {43,44}.
How can I do this?
Thank you really much for your help!
Sincerely

Try This:
Data %>%
group_by(V1) %>%
summarise(test = p(as.character(V18))) %>%
merge(., M_TEST, by = 'V1') %>%
select(V1, V18, test) %>%
mutate(test = str_remove_all(test, pattern = "NA")) %>%
mutate(test = formatC(as.numeric(test), big.mark=",", big.interval = 2L)) %>%
mutate(test = paste0("{", test, "}"))
Edit: For Multiple Columns, this should work:
Data %>%
group_by(V1) %>%
summarise_at(vars(V2:V18), paste0, collapse="") %>%
mutate_at(vars(V2:V18), str_remove_all, pattern = "NA") %>%
mutate_at(vars(V2:V18), as.numeric) %>%
mutate_at(vars(V2:V18), formatC, big.mark=",", big.interval = 2L)

Related

Automate script by changing str_detect?

I am using this script to produce a table. In the second line, for collectionName, I am using "Organization X". I have many different organizations (Org Y, Org Z, .....) to create this table for. Is there a way to automate this? And to automate the naming of the object (currently "orgx" below)?
orgx <- df %>%
filter(str_detect(collectionName, c("Organization X"))) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
Get the column names specific to that organizations as vector, use that as pattern in str_detect by looping over the vector in map and return the output in a list
library(dplyr)
library(purrr)
library(stringr)
vec <- c("Organization X", "Organization Y")
out <- map(vec, ~
df %>%
filter(str_detect(collectionName, .x)) %>%
filter(str_detect(Year, paste(years, collapse = "|"))) %>%
corpus(text_field = "text") %>%
tokens(remove_punct = TRUE) %>%
tokens_select(stopwords('english'),selection='remove') %>%
tokens_tolower(keep_acronyms = FALSE) %>%
tokens_lookup(dictionary = dict, nomatch = TRUE) %>%
dfm() %>%
dfm_group(groups = "Title") %>%
dfm_weight(scheme = "prop") %>%
as.data.frame() %>%
mutate_at(vars(keyterms, true), funs(round(., 4)))
)
names(out) <- sub("^(...).*\\s+(\\S)$", "\\1\\2", vec)
It may be better to keep the output in a list. But, if we need to assign it to different objects, it can be done with list2env or assign
list2env(out, .GlobalEnv)

Error in summarise_impl(.data, dots) : Column Text_from_Customer must be length 1 (a summary value), not 0

Does anybody know why am I getting this error here? If yes, How can I resolve this issue?
This code is working fine on my sample dataset but if I apply this on my whole dataset, I'm getting the above error.
library(dplyr)
library(tidyr)
library(stringr)
library(bindrcpp)
df2<-read.csv("filepath.csv", header=TRUE, sep=",")
df2 %>%
mutate(Body_text = sub("^.*Agent\\s\\w+", "", Body_text)) %>%
separate_rows(Body_text, sep="\\s(?=\\w+:)") %>%
separate(Body_text, into = c("Text_from", "value"), sep=":\\s?") %>%
na.omit %>%
group_by(Id, newgrp = str_c('Text_from_',
replace(Text_from, Text_from != "Customer", "Agent"))) %>%
mutate(rn = row_number()) %>%
pivot_wider(names_from = newgrp, values_from = value) %>%
group_by(Id) %>%
summarise(AgentName = first(Text_from),
Text_from_Agent = str_c(na.omit(Text_from_Agent), collapse=' '),
Text_from_Customer = str_c(na.omit(Text_from_Customer), collapse = ' '))
The following image is my sample dataset.
Can anyone fix this issue? Any help is appreciated.
Thanks!
We can make a condition to check if all the elements columns for a particular group are NA, then return NA or else paste the non-NA elements in str_c
library(stringr)
library(dplyr)
library(tidyr)
df2 %>%
mutate(Body_text = sub("^.*Agent\\s\\w+", "", Body_text)) %>%
separate_rows(Body_text, sep="\\s(?=\\w+:)") %>%
separate(Body_text, into = c("Text_from", "value"), sep=":\\s?") %>%
na.omit %>%
group_by(Id, newgrp = str_c('Text_from_',
replace(Text_from, Text_from != "Customer", "Agent"))) %>%
mutate(rn = row_number()) %>%
pivot_wider(names_from = newgrp, values_from = value) %>%
group_by(Id) %>%
summarise(AgentName = first(Text_from),
Text_from_Agent = = if(all(is.na(Text_from_Agent))) NA_character_ else str_c(Text_from_Agent[!is.na(Text_from_Agent)]), collapse=' '),
Text_from_Customer = if(all(is.na( Text_from_Customer))) NA_character_ else str_c( Text_from_Customer[!is.na( Text_from_Customer)]), collapse=' '))

how can I add to groupings in group_by_at dplyr function

Trying to add a grouping to an tibble that has an existing grouping using dplyr, but the 'add' in group_by_at() doesn't seem to be working. Does anyone have any ideas why?
Example:
df <- data.frame(col1 = sample(letters,100,replace = T),
col2 = sample(letters,100,replace = T),
col3 = sample(letters,100,replace = T))
# group_by_at add doesn't work as desired:
df %>%
group_by(col1) %>%
group_by_at('col2', add = T) %>%
summarise(n = n())
# but group_by add does work as desired:
df %>%
group_by(col1) %>%
group_by(col2, add = T) %>%
summarise(n = n())
We need .add instead of add as the Usage suggested in ?group_by_at is
group_by_at(.tbl, .vars, .funs = list(), ..., .add = FALSE,
.drop = group_drops(.tbl))
df %>%
group_by(col1) %>%
group_by_at('col2', .add = TRUE) %>%
summarise(n = n())
NOTE: After the summarise step, one of the grouping variables are removed especially the last grouping variable

Merging and transforming two sets of data

I have two rather large data files which I need to merge into one, in the following way:
A <- tibble(
id=1:2,
firstName=c("Alice", "Bob")
)
B <- tibble(
id=c(1,1,2),
email=c("alice#wonder.land.com", "alice2#wonderland.com", "bob#builder.com")
)
desiredResult <- tibble(
id=1:2,
firstName=c("Alice", "Bob"),
email1=c("alice#wonderland.com", "bob#builder.com"),
email2=c("alice2#wonderland.com", NA)
)
How can this be done efficiently? I tried using spread() but did not succeed and could only hack together a bad solution:
notGood <-
inner_join(A, B, by = "id") %>%
split(., .$id) %>%
map_dfr(function(x) as.tibble(t(unlist(x)))) %>%
replace(is.na(.), "") %>%
unite(id, id1, id, sep = "") %>%
unite(firstName, firstName1, firstName, sep = "") %>%
unite(email, email1, email, sep = "") %>%
select(id, firstName, matches("email"))
EDIT:
The suggested solutions work great, but how could I apply them to more than one column? Like in this example:
A <- tibble(
id=1:2,
firstName=c("Alice", "Bob")
)
B <- tibble(
id=c(1,1,2),
email=c("alice#wonder.land.com", "alice2#wonderland.com", "bob#builder.com"),
phone=c("123", "456", "789")
)
desiredResult <- tibble(
id=1:2,
firstName=c("Alice", "Bob"),
email1=c("alice#wonderland.com", "bob#builder.com"),
email2=c("alice2#wonderland.com", NA),
phone1=c("123", "789"),
phone2=c("456", NA)
)
Simply adding more column names to the suggested answers doesn't quite work:
A %>%
left_join(B, by='id') %>%
group_by(id)%>%
mutate(rn=paste0('email',row_number())) %>%
mutate(rn2=paste0('phone',row_number())) %>%
spread(rn, email) %>%
spread(rn2, phone)
Check this solution:
B %>%
group_by(id) %>%
mutate(rn = paste0('email', row_number())) %>%
spread(rn, email) %>%
right_join(A) %>%
select(id, firstName, everything())
Answer to added question:
A %>%
left_join(
B %>%
gather(key, val, -id) %>%
group_by(id, key) %>%
mutate(key2 = paste0(key, row_number())) %>%
ungroup() %>%
select(-key) %>%
spread(key2, val)
)
desiredResult <-
A %>%
inner_join(B %>%
group_by(id) %>%
mutate(ColName = paste0("email",row_number())) %>%
ungroup() %>%
spread(ColName, email), by = "id")

Combine list of data frames with one column of characters

I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Resources