passing particular domain from column - r

I am trying to keep only the email addresses from a particular domain. If BB = TRUE, every email that is not on that domain should be replaced with NULL (i.e. a missing value);
if BB = FALSE, all the values should be left as they are.
df6 <- data.frame(name=c("try,xab","xab,Lan","mhy,mun","vgtu,mmc","dgsy,aaf","kull,nnhu","hula,njam","mund,jiha","htfy,ntha","bhr,gydbt","sgyu,hytb","vdti,kula","mftyu,huta","ibdy,vcge","cday,bhsue","ajtu,nudj"),
email=c("xab.try#ybcd.com","Lan.xab#ybcd.com","tth.vgu#ybcd.com","mmc.vgtu#ybcd.com","aaf.dgsy#partnt.com","nnhu.kull#ybcd.com","njam.hula#ybcd.com","jiha.mund#ybcd.com","ntha.htfy#ybcd.com","gydbt.bhr#ybcd.com","hytb.sgyu#ybcd.com","kula.vdti#ybcd.com","huta.mftyu#ybcd.com","ggat.khul#ybcd.com","bhsue.cday#ybcd.com","nudj.ajtu#ybcd.com"))
BB=TRUE
col_drop <- c("partnt.com")
# NOTE(review): this attempt does not filter on the email domain.
# `names(df6) %in% col_drop` compares the column names ("name", "email")
# with "partnt.com", so no column matches and none is dropped.
# `ifelse()` also returns a result shaped like its test; `BB==TRUE` has
# length 1, so only the first element of the chosen branch comes back.
df6 <- ifelse(BB==TRUE,
df6 <- df6[ , !(names(df6) %in% col_drop)],df6) %>% as.data.frame()
the output should be like

This works for me :)
library(dplyr, warn.conflicts = FALSE)
df6 <- data.frame(name=c("try,xab","xab,Lan","mhy,mun","vgtu,mmc","dgsy,aaf","kull,nnhu","hula,njam","mund,jiha","htfy,ntha","bhr,gydbt","sgyu,hytb","vdti,kula","mftyu,huta","ibdy,vcge","cday,bhsue","ajtu,nudj"),
email=c("xab.try#ybcd.com","Lan.xab#ybcd.com","tth.vgu#ybcd.com","mmc.vgtu#ybcd.com","aaf.dgsy#partnt.com","nnhu.kull#ybcd.com","njam.hula#ybcd.com","jiha.mund#ybcd.com","ntha.htfy#ybcd.com","gydbt.bhr#ybcd.com","hytb.sgyu#ybcd.com","kula.vdti#ybcd.com","huta.mftyu#ybcd.com","ggat.khul#ybcd.com","bhsue.cday#ybcd.com","nudj.ajtu#ybcd.com"))
col_drop <- c("partnt.com")
# Keep the addresses on the `col_drop` domain and blank out every other one.
# `fixed = TRUE` matches the domain literally; as a regular expression the "."
# would match any character. `NA_character_` is an explicit, typed missing
# value: passing `NULL` as `false` only worked because older dplyr silently
# skipped it, and newer dplyr releases require `false` to be a vector.
mutate(
  df6,
  email = if_else(grepl(col_drop, email, fixed = TRUE), email, NA_character_)
)
#> name email
#> 1 try,xab <NA>
#> 2 xab,Lan <NA>
#> 3 mhy,mun <NA>
#> 4 vgtu,mmc <NA>
#> 5 dgsy,aaf aaf.dgsy#partnt.com
#> 6 kull,nnhu <NA>
#> 7 hula,njam <NA>
#> 8 mund,jiha <NA>
#> 9 htfy,ntha <NA>
#> 10 bhr,gydbt <NA>
#> 11 sgyu,hytb <NA>
#> 12 vdti,kula <NA>
#> 13 mftyu,huta <NA>
#> 14 ibdy,vcge <NA>
#> 15 cday,bhsue <NA>
#> 16 ajtu,nudj <NA>
Created on 2020-09-27 by the reprex package (v0.3.0)

Does this work:
> df6[!grepl('partnt.com', df6$email), 'email'] <- NA
> df6
name email
1 try,xab <NA>
2 xab,Lan <NA>
3 mhy,mun <NA>
4 vgtu,mmc <NA>
5 dgsy,aaf aaf.dgsy#partnt.com
6 kull,nnhu <NA>
7 hula,njam <NA>
8 mund,jiha <NA>
9 htfy,ntha <NA>
10 bhr,gydbt <NA>
11 sgyu,hytb <NA>
12 vdti,kula <NA>
13 mftyu,huta <NA>
14 ibdy,vcge <NA>
15 cday,bhsue <NA>
16 ajtu,nudj <NA>
>

below should work:
library(data.table)
setDT(df6)
BB <- TRUE
domain_to_keep <- "partnt.com"
# Anchor the match to "#<domain>" at the end of the address. The dots in the
# domain are escaped so they match a literal "." instead of any character
# (otherwise e.g. "#partntXcom" would also count as the domain to keep).
domain_pattern <- paste0(
  "#", gsub(".", "\\.", domain_to_keep, fixed = TRUE), "$"
)
# When BB is TRUE, blank the email of every row that is not on the domain;
# when BB is FALSE the row filter is FALSE everywhere and nothing changes.
df6[BB & !grepl(domain_pattern, email), email := ""]

Related

R construct a data frame using 2 column

I have a data frame like:
df
group group_name value
1 1 <NA> VV0001
2 1 <NA> VV_RS00280
3 2 <NA> VV0002
4 2 <NA> VV_RS00285
5 3 <NA> VV0003
6 3 <NA> VV_RS00290
7 5 <NA> VV0004
8 5 <NA> VV_RS00295
9 6 <NA> VV0005
10 6 <NA> VV_RS00300
11 7 <NA> VV0006
12 7 <NA> VV_RS00305
13 8 <NA> VV0007
14 8 <NA> VV_RS00310
15 9 <NA> VV0009
16 9 <NA> VV_RS00315
17 10 <NA> VV0011
18 10 <NA> VV_RS00320
19 11 <NA> VV0012
20 11 <NA> VV_RS00325
21 12 <NA> VV0013
22 12 <NA> VV_RS00330
so I want to construct an other data frame using the columns "group" and "value", all the group 1 (df[df$group == 1,]) will get the data in "value" column (VV0001, VV_RS00280) and construct the data.frame like:
group value
1 VV0001 VV_RS00280
and then the next df[df$group == 2,], and so on, at the end will be:
group value
1 VV0001 VV_RS00280
2 VV0002 VV_RS00285
3 VV0003 VV_RS00290
4 VV0004 VV_RS00295
I tried to do it manually but the nrow(df) is big, > 3000 or more !!
Thanks
You may try,
library(dplyr)
library(tidyr)
# Reshape the paired rows of `df` into one output row per original group.
df %>%
  # free up the name "group" so it can be reused as an output column
  rename(idv = group) %>%
  # label the rows alternately: "group", "value", "group", "value", ...
  mutate(group_name = rep(c("group", "value"), times = n() / 2)) %>%
  group_by(idv) %>%
  # spread the two labelled rows of every `idv` across two columns
  pivot_wider(names_from = group_name, values_from = value) %>%
  ungroup() %>%
  select(-idv)
group value
<chr> <chr>
1 VV0001 VV_RS00280
2 VV0002 VV_RS00285
3 VV0003 VV_RS00290
4 VV0004 VV_RS00295
5 VV0005 VV_RS00300
6 VV0006 VV_RS00305
7 VV0007 VV_RS00310
8 VV0009 VV_RS00315
9 VV0011 VV_RS00320
10 VV0012 VV_RS00325
11 VV0013 VV_RS00330

Add column from df2 to df1 based on match between df1 and df2

I have two data sets, df1 and df2, which have the columns "ID" and "State" in common:
df1 <- data.frame(ID=c(1:20), State=c("NA","NA","NA","NA","NA","NA","NA","NA","NA","NA","CA","IL","SD","NC","SC","WA","CO","AL","AK","HI"))
df2 <- data.frame(ID=c(1,2,3,4,5,"NA","NA","NA","NA","NA"), Year=c("2020","2021","2020","2020","2021","2020","2020","2021","2020","2019"),State=c("NA","NA","NA","NA","NA","CA","SC","NY","NJ","OR"))
How can I add Year from df2 to df1 to the same ID that exists in df1 OR the same State that exists in df1?
The reason why I want to make this change: I just need to add this "Year" information from df2 to df1.
Here's a dplyr solution:
library(dplyr)
# Build a shared key in both tables: the ID where State is the "NA"
# placeholder string, otherwise the State itself.
df1 <- mutate(df1, join = ifelse(State == "NA", ID, State))
df2 <- mutate(df2, join = ifelse(State == "NA", ID, State))

# Attach the df2 columns to df1 on that key, then collapse the duplicated
# State/ID columns back into a single State and ID.
df_new <- df1 %>%
  left_join(df2, by = "join") %>%
  mutate(State = coalesce(State.x, State.y)) %>%
  select(-c(State.x, State.y, join, ID.y)) %>%
  rename(ID = ID.x)
This gives us:
ID Year State
1 1 2020 NA
2 2 2021 NA
3 3 2020 NA
4 4 2020 NA
5 5 2021 NA
6 6 <NA> NA
7 7 <NA> NA
8 8 <NA> NA
9 9 <NA> NA
10 10 <NA> NA
11 11 2020 CA
12 12 <NA> IL
13 13 <NA> SD
14 14 <NA> NC
15 15 2020 SC
16 16 <NA> WA
17 17 <NA> CO
18 18 <NA> AL
19 19 <NA> AK
20 20 <NA> HI
You could do:
# Turn the "NA" placeholder strings into real missing values and the digit
# strings into integers. `as.is = TRUE` keeps State as character (no factors);
# the type.convert() documentation asks callers to specify `as.is`, and
# recent R versions warn when it is left out.
df1 <- type.convert(df1, as.is = TRUE)
df2 <- type.convert(df2, as.is = TRUE)
df1 %>%
  # Year for the rows that match on ID
  left_join(select(df2, -State), by = "ID") %>%
  # Year for the rows that match on State (only the df2 rows without an ID)
  left_join(select(filter(df2, is.na(ID)), -ID), by = "State") %>%
  # keep whichever of the two lookups found a Year, then drop the helpers
  mutate(Year = coalesce(Year.x, Year.y), Year.x = NULL, Year.y = NULL)
ID State Year
1 1 <NA> 2020
2 2 <NA> 2021
3 3 <NA> 2020
4 4 <NA> 2020
5 5 <NA> 2021
6 6 <NA> NA
7 7 <NA> NA
8 8 <NA> NA
9 9 <NA> NA
10 10 <NA> NA
11 11 CA 2020
12 12 IL NA
13 13 SD NA
14 14 NC NA
15 15 SC 2020
16 16 WA NA
17 17 CO NA
18 18 AL NA
19 19 AK NA
20 20 HI NA

Dropdown all list and collect data rcurl

From a page like this
https://stackoverflow.com/users/11786778/nathalie?tab=reputation
How is it possible to expand every drop-down entry in the reputation table and collect the information that is loaded when each entry is expanded?
Is this what you want?
library(rvest)
library(magrittr)
library(plyr)

# Page to scrape (one URL at a time)
url <- "https://stackoverflow.com/users/11786778/nathalie?tab=reputation"

# Read the page, select every <table> that is the first table child of its
# parent, and parse each selected node into a data frame
pricesdata <- read_html(url) %>%
  html_nodes(xpath = "//table[1]") %>%
  html_table(fill = TRUE)

# Stack the parsed tables into a single data frame
df <- ldply(pricesdata, data.frame)
Produces:
1 <NA>
2 Take the result of call for a list of ids
3 <NA>
4 <NA>
5 <NA>
6 Add Detailed history
7 <NA>
8 <NA>
9 <NA>
10 <NA>
11 <NA>
12 <NA>
13 <NA>
14 <NA>
15 <NA>
16 <NA>
17 <NA>
18 <NA>
19 <NA>
20 <NA>
21 <NA>
22 <NA>
23 <NA>
24 <NA>
25 <NA>
26 <NA>
27 <NA>
28 <NA>
29 <NA>
30 <NA>
31 <NA>
32 <NA>
33 <NA>
34 <NA>
35 <NA>
36 <NA>
37 <NA>
38 <NA>
39 <NA>
40 <NA>
41 <NA>
42 <NA>
43 <NA>
>

How can I tokenize a string in R?

I am trying to calculate readability, but it seems everything is written to expect either a file path or a Corpus. How do I handle a string?
Error (on the tokenization step):
Error: Unable to locate
I tried:
# Two sample documents. c() is needed to combine them into one character
# vector; a bare comma between two string literals is a syntax error.
str <- c(
  "Readability zero one. Ten, Eleven.",
  "The cat in a dilapidated tophat."
)
library(koRpus)
ll.tagged <- tokenize(str, lang="en")
readability(ll.tagged,measure="Flesch.Kincaid")
You need to download the language file
# Download and install the English language support package for koRpus
install.koRpus.lang(c("en"))
library(koRpus.lang.en)
# format = "obj" makes tokenize() analyse the text held in the R object `str`
# instead of treating its value as the path of a file to read
ll.tagged <- tokenize(str, format = "obj", lang = "en")
ll.tagged
doc_id token tag lemma lttr wclass desc stop stem idx sntc
1 <NA> Readability word.kRp 11 word <NA> <NA> <NA> 1 1
2 <NA> zero word.kRp 4 word <NA> <NA> <NA> 2 1
3 <NA> one word.kRp 3 word <NA> <NA> <NA> 3 1
4 <NA> . .kRp 1 fullstop <NA> <NA> <NA> 4 1
5 <NA> Ten word.kRp 3 word <NA> <NA> <NA> 5 2
6 <NA> , ,kRp 1 comma <NA> <NA> <NA> 6 2
[...]
10 <NA> cat word.kRp 3 word <NA> <NA> <NA> 10 3
11 <NA> in word.kRp 2 word <NA> <NA> <NA> 11 3
12 <NA> a word.kRp 1 word <NA> <NA> <NA> 12 3
13 <NA> dilapidated word.kRp 11 word <NA> <NA> <NA> 13 3
14 <NA> tophat word.kRp 6 word <NA> <NA> <NA> 14 3
15 <NA> . .kRp 1 fullstop <NA> <NA> <NA> 15 3

How can I transform a long-formated data frame to a wide-formated one with multiple values within a cell in R?

Here is the original df:
area sector item
1 East A <NA>
2 South A Baidu
3 South A Tencent
4 West A <NA>
5 North A <NA>
6 East B Microsoft
7 East B Google
8 East B Facebook
9 South B <NA>
10 West B <NA>
11 North B <NA>
12 East C <NA>
13 South C <NA>
14 West C <NA>
15 North C Alibaba
16 East D <NA>
17 South D <NA>
18 West D Amazon
19 North D <NA>
20 East E <NA>
21 South E <NA>
22 West E <NA>
23 North E <NA>
How can I transform the above df to the following one? Some cells in the transformed df have multiple items from the original df.
Sector East South West North
1 A <NA> "Baidu, Tencent" <NA> <NA>
2 B "Microsoft, Google, Facebook" <NA> <NA> <NA>
3 C <NA> <NA> <NA> "Alibaba"
4 D <NA> <NA> "Amazon" <NA>
5 E <NA> <NA> <NA> <NA>
A quick solution is to use the toString function as the aggregation function while transforming from long to wide with the reshape2 package:
# One row per sector, one column per area; toString() collapses all items
# that land in the same sector/area cell into one comma-separated string.
reshape2::dcast(df, sector ~ area, toString)
#Using item as value column: use value.var to override.
# sector East North South West
# 1 A <NA> <NA> Baidu, Tencent <NA>
# 2 B Microsoft, Google, Facebook <NA> <NA> <NA>
# 3 C <NA> Alibaba <NA> <NA>
# 4 D <NA> <NA> <NA> Amazon
# 5 E <NA> <NA> <NA> <NA>
This is almost a dupe of this but most of the solutions there won't work for this case- but this can still give you some ideas.
And just for fun, here is a base solution:
# aggregate() pastes together the items of each area/sector pair;
# reshape() then spreads the areas into one "item.<area>" column each.
reshape(aggregate(item ~ area + sector, data = df, paste, collapse = ","),
idvar = "sector", timevar = "area", direction = "wide")
sector item.East item.North item.South item.West
1 A <NA> <NA> Baidu,Tencent <NA>
5 B Microsoft,Google,Facebook <NA> <NA> <NA>
9 C <NA> Alibaba <NA> <NA>
13 D <NA> <NA> <NA> Amazon
17 E <NA> <NA> <NA> <NA>
Here is an option with dplyr/tidyr
library(dplyr)
library(tidyr)
# Collapse the items of every area/sector pair into one comma-separated
# string, then give each area its own column (one row per sector).
# Uses `df`, the data frame shown in the question. `.groups = "drop"` returns
# an ungrouped result, and pivot_wider() replaces the superseded spread().
df %>%
  group_by(area, sector) %>%
  summarise(item = toString(item), .groups = "drop") %>%
  pivot_wider(names_from = area, values_from = item)

Resources