Combine list of data frames with one column of characters - r

I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.

I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Related

comparing the variables and their values between two data frames

I have two data frames with same kind of data, now i want to check for all the columns in both data frames have same kind of text in all columns in both data frames .
so for example the column name "sales executives" in both data frames have exact name "Micheal klay" in both data frames but if there is any spelling error or extra space i want to show it as not matching.
I have tried below approach and its working for small database but because my data is very big, data having approx 10 - 40 millions or records so its showing error
do we have any solution or any other approach to do that
cannot allocate vector of size 3.2GB
library(tidyverse)
df1 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","NY","OD","CA","OD"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2),emial=c("dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK"),
mkl=c("m","f","m","m","f","m","m","f","m","m","f","m","m"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt"))
df2 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8,8,6),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","ny","OD","CA","OD","NY","OL"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani","juna","mau"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2,2,1),emial=c("dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low","High","High"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK","CHI","JYP"),
mkl=c("male","female","male","male","female","male","male","female","male","male","female","male","male","female","male"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car","Bus","Bus"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man","jr","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt","kent","bho"))
df1_long <- df1 %>%
as_tibble() %>%
mutate_if(is.double, as.character) %>% distinct() %>%
pivot_longer(everything(), names_to = "Names", values_to = "options") %>%
arrange(Names, options)
df2_long <- df2 %>%
as_tibble() %>%
mutate_if(is.double, as.character) %>% distinct() %>%
pivot_longer(everything(), names_to = "Names", values_to = "options") %>%
arrange(Names, options)
T1 <- df1_long %>%
full_join(df2_long, by=c("Names", "options"), keep = TRUE) %>%
distinct(Names.x, options.x, Names.y, options.y) %>%
arrange(Names.x, Names.y, options.x, options.y) %>%
mutate(
consistant_names = !is.na(Names.x) & !is.na(Names.y),
consistant_options = !is.na(options.x) & !is.na(options.y)
)
the output required like below
below are inconsistency between data bases

Formatting of Data Frames in R

I have a data.frame with the following structure:
What I need is that in case that a value in the first column occures more than once, all corresponding entries in column V18 are concluded in one cell.
I applied the folling code.
p <- function(v) {
Reduce(f=paste0, x = v)
}
Data %>%
group_by(V1) %>%
summarise(test = p(as.character(V18))) %>%
merge(., M_TEST, by = 'V1') %>%
select(V1, V18, test)
It gives:
What I need is that instead of 4344, it is {43,44}.
How can I do this?
Thank you really much for your help!
Sincerely
Try This:
Data %>%
group_by(V1) %>%
summarise(test = p(as.character(V18))) %>%
merge(., M_TEST, by = 'V1') %>%
select(V1, V18, test) %>%
mutate(test = str_remove_all(test, pattern = "NA")) %>%
mutate(test = formatC(as.numeric(test), big.mark=",", big.interval = 2L)) %>%
mutate(test = paste0("{", test, "}"))
Edit: For Multiple Columns, this should work:
Data %>%
group_by(V1) %>%
summarise_at(vars(V2:V18), paste0, collapse="") %>%
mutate_at(vars(V2:V18), str_remove_all, pattern = "NA") %>%
mutate_at(vars(V2:V18), as.numeric) %>%
mutate_at(vars(V2:V18), formatC, big.mark=",", big.interval = 2L)

add summary `n` from one dataframe to another dataframe (tidyverse)

I was wondering if there might be a way to replace the column fpc in DATA2 with corresponding fpc obtained from DATA1?
library(tidyverse)
dat <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/su.csv')
## 10000 rows ################
DATA1 <- dat %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat)
dat2 <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/out.csv')
## 200 rows #################
DATA2 <- dat2 %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat2)
You can join the dataframe and use coalesce to select fpc from DATA2.
library(dplyr)
result <- DATA2 %>%
left_join(DATA1 %>% distinct(gender, pre, fpc),
by = c('gender', 'pre')) %>%
mutate(fpc = coalesce(fpc.y, fpc.x)) %>%
select(names(DATA2))
nrow(result)
#[1] 200
It would be more efficient to do this in data.table
library(data.table)
setDT(DATA2)[as.data.table(unique(DATA1[c('gender', 'pre', 'fpc')])),
fpc := i.fpc, on = .(gender, pre)]

Use dplyr to get index of first column with certain value per group or row

I have the following script. Option 1 uses a long format and group_by to identify the first step of many where the status equals 0.
Another option (2) is to use apply to calculate this value for each row, and then transform the data to a long format.
The firs option does not scale well. The second does, but I was unable to get it into a dplyr pipe. I tried to solve this with purrr but did not succeeed.
Questions:
Why does the first option not scale well?
How can I transform the second option in a dplyr pipe?
require(dplyr)
require(tidyr)
require(ggplot2)
set.seed(314)
# example data
dat <- as.data.frame(matrix(sample(c(0,1),
size = 9000000,
replace = TRUE,
prob = c(5,95)),
ncol = 9))
names(dat) <- paste("step",1:9, sep="_")
steps <- dat %>% select(starts_with("step_")) %>% names()
# option 1 is slow
dat.cum <- dat %>%
mutate(id = row_number()) %>%
gather(step, status,-id) %>%
group_by(id) %>%
mutate(drop = min(if_else(status==0,match(step, steps),99L))) %>%
mutate(status = if_else(match(step, steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
# option 2 is faster
dat$drop <- apply(dat,1,function(x) min(which(x==0),99))
dat.cum <- dat %>%
gather(step,status,-drop) %>%
mutate(status = if_else(match(step,steps)>=drop,0,1))
ggplot(dat.cum, aes(x = step, fill = factor(status))) +
geom_bar()
If you would like to map along rows you could do:
dat %>%
mutate(drop2 = map_int(seq_len(nrow(dat)), ~ min(which(dat[.x, ] == 0L), 99L)))
It could be that "gathering and grouping" is faster than Looping:
dat %>%
as_tibble() %>%
select(starts_with("step_")) %>%
mutate(row_nr = row_number()) %>%
gather(key = "col", value = "value", -row_nr) %>%
arrange(row_nr, col) %>%
group_by(row_nr) %>%
mutate(col_index = row_number()) %>%
filter(value == 0) %>%
summarise(drop3 = min(col_index)) %>%
ungroup() %>%
right_join(dat %>%
mutate(row_nr = row_number()),
by = "row_nr") %>%
mutate(drop3 = if_else(is.na(drop3), 99, drop3))

How do I use arrange inside a function?

I am trying to create a user-defined function which carries out some data transformations.
Mock data:
library(tidyverse)
set.seed(1)
sampledata_a <- data.frame(
patientid = sample(1:100),
servicetype = sample(c("service1", "service2", "service3", "service4", "service5"), 100, replace=TRUE),
date = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 100)
)
sampledata_b <- data.frame(
patientid = sample(1:100),
servicetype = sample(c("service6", "service7", "service8", "service9", "service10"), 100, replace=TRUE),
date = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 100)
)
sampledata1 <- rbind(sampledata_a, sampledata_b)
User-defined function:
get_most_recent_and_unique <- function(inputdata, groupbyvar, uniquevar, datevar) {
# first selects the most recent observation for each unique variable combination
outputdata <- inputdata %>%
distinct() %>%
arrange(groupbyvar, uniquevar, desc(datevar)) %>%
mutate(orderkey = paste0(groupbyvar, uniquevar, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(groupbyvar, desc(datevar), uniquevar)
# then tranpose from long to wide, and unite variables other than first variable into one
outputdata <- outputdata %>%
select(groupbyvar, uniquevar) %>%
group_by(groupbyvar) %>%
mutate(pos=1:n()) %>%
spread(pos, uniquevar) %>%
unite(uniquevar, -groupbyvar, sep=" / ")
return(outputdata)
}
When running the function as below:
outputdata <- get_most_recent_and_unique(sampledata1, "patientid", "servicetype", "date")
Following error message:
Error in arrange_impl(.data, dots) :
incorrect size (1) at position 1, expecting : 100
However, the code works fine when outside the user-defined function. I wonder if anyone can tell me what is wrong?
testoutputdata <- sampledata1 %>%
distinct() %>%
arrange(patientid, servicetype, desc(date)) %>%
mutate(orderkey = paste0(patientid, servicetype, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(patientid, desc(date), servicetype)
testoutputdata <- testoutputdata %>%
select(patientid, servicetype) %>%
group_by(patientid) %>%
mutate(pos=1:n()) %>%
spread(pos, servicetype) %>%
unite(servicetype, -patientid, sep=" / ")
Try this:
get_most_recent_and_unique <- function(inputdata, groupbyvar, uniquevar, datevar) {
groupbyvar <- enquo(groupbyvar)
uniquevar <- enquo(uniquevar)
datevar <- enquo(datevar)
# first selects the most recent observation for each unique variable combination
outputdata <- inputdata %>%
distinct() %>%
arrange(!! groupbyvar, !! uniquevar, desc(!! datevar)) %>%
mutate(orderkey := paste0(!! groupbyvar, !! uniquevar, sep = "")) %>%
group_by(orderkey) %>%
do(head(., n=1)) %>%
ungroup() %>%
arrange(!! groupbyvar, desc(!! datevar), !! uniquevar)
# then tranpose from long to wide, and unite variables other than first variable into one
outputdata <- outputdata %>%
select(!! groupbyvar, !! uniquevar) %>%
group_by(!! groupbyvar) %>%
mutate(pos=1:n()) %>%
spread(pos, !! uniquevar) %>%
unite(!! uniquevar, -!! groupbyvar, sep=" / ")
return(outputdata)
}
outputdata <- get_most_recent_and_unique(sampledata1, patientid, servicetype, date) # No quotation with arguments!
Here is the output:
patientid servicetype
<int> <chr>
1 1 service7 / service3
2 2 service10 / service1
3 3 service4 / service9
4 4 service8 / service3
5 5 service6 / service1
It seems to match your expectations when I compare them:
all.equal(outputdata, testoutputdata)
[1] TRUE
Note that you shouldn't quote the arguments when specifying the function, i.e. outputdata <- get_most_recent_and_unique(sampledata1, patientid, servicetype, date) will work while outputdata <- get_most_recent_and_unique(sampledata1, "patientid", "servicetype", "date") won't.

Resources