Removing the special symbols in data.frame column values - r

I have two data frames, each with a column called name.
df1:
name
#one2
!iftwo
there_2_go
come&go
df1 = structure(list(name = c("#one2", "!iftwo", "there_2_go", "come&go")),.Names = c("name"), row.names = c(NA, -4L), class = "data.frame")
df2:
name
One2
IfTwo#
there-2-go
come.go
df2 = structure(list(name = c("One2", "IfTwo#", "there-2-go", "come.go")),.Names = c("name"), row.names = c(NA, -4L), class = "data.frame")
Comparing the two data frames for inequality with %in% is cumbersome because of the special symbols. Removing the special symbols with stringr could be useful, but how exactly can we combine stringr functions with %in% and display the mismatches between the two data frames?
I have already used mutate() to convert all the names to lowercase with tolower(), as follows:
df1<-mutate(df1,name=tolower(df1$name))
df2<-mutate(df2,name=tolower(df2$name))
Current output of comparison:
df2[!(df2 %in% df1),]
[1] "one2" "iftwo#" "there-2-go" "come.go"
Expected output as essentially the contents are same but with special symbols:
df2[!(df2 %in% df1),]
character(0)
Question : How do we ignore the symbols in the contents of the Frame

Here it is in a function,
#' Flag names in `df1` that have no counterpart in `df2`
#'
#' Both `name` columns are normalised (punctuation removed, lower-cased)
#' before they are compared, so "#one2" and "One2" count as the same value.
#'
#' @param df1 Data frame with a `name` column; its rows are the ones tested.
#' @param df2 Data frame with a `name` column; supplies the reference values.
#' @return A logical vector with one element per row of `df1`, named by the
#'   normalised name. `TRUE` marks a name that does not occur in `df2`.
f1 <- function(df1, df2) {
  normalise <- function(x) tolower(gsub("[[:punct:]]", "", x))
  i1 <- normalise(df1$name)
  i2 <- normalise(df2$name)
  # Exact set membership. A regex alternation built from i2 would also accept
  # substrings ("one2x" against "one2"), would match everything as soon as one
  # normalised name is empty, and fails on zero-row input.
  stats::setNames(!(i1 %in% i2), i1)
}
# Pass df1 explicitly: a bare `df` resolves to the stats::df() function,
# and `df$name` then fails.
f1(df1, df2)
# one2 iftwo there2go comego
# FALSE FALSE FALSE FALSE
# or use it for indexing,
df2[f1(df1, df2), ]
# character(0)

Related

getting the names of data frames from list in R

I have a list which contains 36 data frames. I want to create a list containing all the names of those data frames :
dput(myfiles[1:2])
list(structure(list(X.Treatment.1.Treatment.10.Treatment.2.Treatment.3.Treatment.4.Treatment.5.Treatment.6.Treatment.7.Treatment.8.Treatment.9 = c("Treatment.1,1,0.779269898976048,0.987582177817029,0.999865208543176,0.999637376053903,0.969316946773183,0.992798203986959,0.424960684181985,0.804869101320034,0.934784678841289",
"Treatment.10,0.779269898976048,1,0.671138248567996,0.789454098761072,0.762111859396959,0.909408486972833,0.848734212632234,-0.236126723371631,0.255300504533133,0.505840502482398",
"Treatment.2,0.987582177817029,0.671138248567996,1,0.984869671366683,0.991454531822078,0.918661911614817,0.961649044703906,0.561895346303209,0.888107698459535,0.978982111839266",
"Treatment.3,0.999865208543176,0.789454098761072,0.984869671366683,1,0.99906051831384,0.973222174821046,0.994631289318653,0.410041249133801,0.795017057233326,0.9288266084351",
"Treatment.4,0.999637376053903,0.762111859396959,0.991454531822078,0.99906051831384,1,0.962346166096083,0.989212254209048,0.449182113577399,0.820557713571369,0.944010924367408",
"Treatment.5,0.969316946773183,0.909408486972833,0.918661911614817,0.973222174821046,0.962346166096083,1,0.991784351747349,0.189407610662142,0.634294194129571,0.81878574572229",
"Treatment.6,0.992798203986959,0.848734212632234,0.961649044703906,0.994631289318653,0.989212254209048,0.991784351747349,1,0.31345701514879,0.72797778020465,0.885498274066011",
"Treatment.7,0.424960684181985,-0.236126723371631,0.561895346303209,0.410041249133801,0.449182113577399,0.189407610662142,0.31345701514879,1,0.879237827530393,0.718791431723663",
"Treatment.8,0.804869101320034,0.255300504533133,0.888107698459535,0.795017057233326,0.820557713571369,0.634294194129571,0.72797778020465,0.879237827530393,1,0.963182415401058",
"Treatment.9,0.934784678841289,0.505840502482398,0.978982111839266,0.9288266084351,0.944010924367408,0.81878574572229,0.885498274066011,0.718791431723663,0.963182415401058,1"
)), class = "data.frame", row.names = c(NA, -10L)), structure(list(
X.Treatment.1.Treatment.10.Treatment.2.Treatment.3.Treatment.4.Treatment.5.Treatment.6.Treatment.7.Treatment.8.Treatment.9 = c("Treatment.1,1,NA,NA,NA,NA,NA,NA,NA,NA,NA",
"Treatment.10,NA,1,NA,NA,NA,NA,NA,NA,NA,NA", "Treatment.2,NA,NA,1,NA,NA,NA,NA,NA,NA,NA",
"Treatment.3,NA,NA,NA,1,NA,NA,NA,NA,NA,NA", "Treatment.4,NA,NA,NA,NA,1,NA,NA,NA,NA,NA",
"Treatment.5,NA,NA,NA,NA,NA,1,NA,NA,NA,NA", "Treatment.6,NA,NA,NA,NA,NA,NA,1,NA,NA,NA",
"Treatment.7,NA,NA,NA,NA,NA,NA,NA,1,NA,NA", "Treatment.8,NA,NA,NA,NA,NA,NA,NA,NA,1,NA",
"Treatment.9,NA,NA,NA,NA,NA,NA,NA,NA,NA,1")), class = "data.frame", row.names = c(NA,
-10L)))
I want a list containing all the names of the data frames. The problem is that when I write:
names(list_median)[i]
It just returns NULL. Each data frame in the list is a correlation matrix that looks like this.
I am not sure whether this is the right way to do it:
mat_names <- lapply(list_median, \(x) do.call(cbind, dimnames(x)))
mat_names <- lapply(mat_names, \(x) {colnames(x) <- c("Rows", "Cols"); x})
Here is a possible explanation why you are running into issues. The code is commented:
# Extract each data frame of list_median into the global environment
# as df1, df2, ...
for (i in seq_along(list_median)) {
  assign(paste0("df", i), list_median[[i]])
}
# You should now see df1, df2, etc. in the Environment pane.

# Build a list from two of them WITHOUT naming the elements:
my_list <- list(df1, df2)
# Now try to get the names
names(my_list)
# NULL  -- list() only records names that are supplied explicitly

# Now name the elements when building the list and ask again:
my_list <- list(df1nownamed = df1, df2nownamed = df2)
names(my_list)
# and you will get:
# [1] "df1nownamed" "df2nownamed"

Loop over a list of dataframes and change column names in R

I have a list of data frames in which some data frames have messed-up column names. My intention is to loop over the list, identify the data frames whose column names are messed up, then delete those column names and promote the first row to be the column names. This is a sample of my data frames:
dput(df)
structure(list(v1 = c("Silva", "Brandon", "Mango"),
v2 = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
structure(list(X2 = c("v1", "Brandon", "Mango"),
X..X1 = c("v2","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
This is an example of my data frames: in df2 the real column names appear as the first row. I need to loop through the list, see which data frames have messed-up column names like df2, then delete the column names and replace them with the first row. This is what I tried:
dflist <- list(df,df2)
remNames <- c("X2", "X..x1")
dflist <- c()
for (i in 1:length(dflist)) {
if(dflist[[i]][names(dflist[[i]])] == remNames){
colnames(dflist[[i]]) <- dflist[[i]][1,]
dflist[[i]] = dflist[[i]][-1, ]
}
}
This doesn't work. What am I missing? My EXPECTED OUTPUT is a list of data frames that all have the same column names, which are supposed to be v1 and v2.
dflist <- list(df, df2)
for (i in seq_along(dflist)) {
  # Membership test on the column names. `==` would compare position by
  # position and silently recycle remNames when the lengths differ.
  if (any(names(dflist[[i]]) %in% remNames)) {
    # Promote the first row to the header as a plain character vector ...
    first_row <- dflist[[i]][1, , drop = FALSE]
    colnames(dflist[[i]]) <- unname(
      vapply(first_row, as.character, character(1))
    )
    # ... then drop that row; drop = FALSE keeps a one-column data frame
    # from collapsing into a vector.
    dflist[[i]] <- dflist[[i]][-1, , drop = FALSE]
  }
}
dflist[[i]][names(dflist[[i]])] == remNames compares every cell of the entire data frame with remNames rather than comparing the column names, hence the if condition is FALSE and nothing happens. Consider the following example when i=2:
> i=2
> dflist[[i]][names(dflist[[i]])] == remNames
X2 X..X1
[1,] FALSE FALSE
[2,] FALSE FALSE
[3,] FALSE FALSE
A better solution is to use grepl to see if the column names contain a .. or X, so the if becomes
if(any(grepl('\\.\\.|X',names(dflist[[i]])))){...}

subset df according nested list while there is a white space

I have a data frame and I would like to subset it according to specific values. When I tried to do it, there was a problem because of the white space inside the values in sample_df$mentions.
I used this script for subsetting the data frame:
sample_list <- list()
for (i in colnames(sample_name)){
sample_list <- sapply(sample_df$mentions, function(x)any(x %in% sample_name[[i]]))
new_sample_df <- sample_df[sample_list,]
}
I have tried strsplit function to get rid of the space but it has created other problems.
sample_df$mentions <- strsplit(as.charater(sample_df$mentions),"[[:space:]]")
Thank you for your help in advance.
My expected outcome should be like this:
mentions screen_name
5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
sample_name reproducible data:
sample_name <- structure(list(Name = structure(2:1, .Label = c("hamzayerlikaya",
"SSSBBL777"), class = "factor")), row.names = c(NA, -2L), class = "data.frame")
sample_df reproducible data:
sample_df <- structure(list(mentions = list(character(0), "srgnsnmz92", character(0),
"Berivan_Aslan_", c("islambey1453", " hamzayerlikaya", " tahaayhan",
" hidoturkoglu15"), character(0), "themarginale", character(0),
character(0), c("nurhandnci", " SSSBBL777", " serkanacar007",
" Chequevera06", " kubilayy81")), screen_name = c("SaadetYakar",
"beraydogru", "EL_Turco_DLC", "hebunagel", "ak_Furkan54", "zaferakyol011",
"melmitem", "mobbingabla", "BekarKronik", "tanrica_gaia")), row.names = c(NA,
10L), class = "data.frame")
We can loop through the 'Name' and use that in grepl, Reduce it to a single logical vector and subset the rows of 'sample_df'
sample_df[Reduce(`|`, lapply(as.character(sample_name$Name),
grepl, x = sample_df$mentions)),]
# mentions screen_name
#5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
NOTE: This would work with any length of 'Name' column
Another option is regex_inner_join
library(fuzzyjoin)
library(tidyverse)
regex_inner_join(sample_df, sample_name, by = c("mentions" = "Name")) %>%
select(mentions, screen_name)
# mentions screen_name
#1 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#2 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
Since mentions is a list we can use sapply and select only those rows in sample_df where any of the mentions has Name in it.
sample_df[sapply(sample_df$mentions, function(x) any(grepl(pattern, x))), ]
# mentions screen_name
#5 islambey1453, hamzayerlikaya, tahaayhan, hidoturkoglu15 ak_Furkan54
#10 nurhandnci, SSSBBL777, serkanacar007, Chequevera06, kubilayy81 tanrica_gaia
where pattern is
pattern = paste0("\\b", sample_name$Name, "\\b", collapse = "|")

Ordering columns of a data frame

I would like to reorder the columns of the data frame given below
structure(list(DETECTION = c(0.000219, 0.000673, 0.000322, 0.602006,
0.000468, 0.204022, 0.000491, 0.003067), VALUE = structure(1:8, .Label = c("10071_s_at",
"1053_at", "117_at", "1255_g_at", "1294_at", "1320_at", "1405_i_at",
"14312_at"), class = "factor")), .Names = c("DETECTION", "VALUE"
), class = "data.frame", row.names = c(NA, -8L))
I want the numeric column (DETECTION) to be the second column.
I tried something here
d1 <- data[1, , drop = FALSE]
nums <- d1[, nn <- sapply(d1, is.numeric)]
ch <- d1[, !nn, drop = FALSE]
id <- names(ch[, grepl('_at$', as.character(unlist(ch))), drop = FALSE])
p <- names(nums)
d <- data[,c(id,p)]
However names(nums) returns NULL . What is going wrong here.
dt <- as.data.table(data)
From R help : " When it's required to reorder the columns of a data.table, the idiomatic way is to use setcolorder(x, neworder), instead of doing x <- x[, neworder, with=FALSE]. This is because the latter makes an entire copy of the data.table, which maybe unnecessary in most situations."
setcolorder(dt,c("VALUE","DETECTION"))
names(nums) is NULL because the dimensions were dropped. You can add the argument drop to keep the dimensions as they are:
names(nums)
#NULL
nums <- d1[, nn <- sapply(d1, is.numeric), drop=FALSE]
names(nums)
#[1] "DETECTION"

R: Building a list from matching values in a data.frame

I have a 3 column data frame which looks a little like this:
id name links
1 134235 dave "34657","34563","23459"
2 23459 mary "134235","45868","45677"
3 165432 jane "134235","23459","44657"
where id and name values are unique, and links is a string of ids which indicate an association with some of the names in each row. So for example dave includes the links id 23459 which is mary so dave is connected to mary. What I need to produce is a pair list of all the connections in the data so with the example data I would output something like:
dave,mary
mary,dave
jane,dave
jane,mary
I am very new to R and have seen amazing things done with functions like apply. Before going off and trying to replicate a solution that would look more like a JavaScript routine and be very inefficient, I wondered if anyone could help.
One solution, using Matt's dput():
tab <- structure(list(
id = c("134235", "23459", "165432"),
name = c("dave", "mary", "jane"),
links = c("'34657', '34563', '23459'",
"'134235', '45868', '45677'",
"'134235', '23459', '44657'")),
.Names = c("id", "name", "links"),
row.names = c(NA, -3L), class = "data.frame")
#' Build the "name,linked_name" pairs for one person
#'
#' @param name Name of the person who owns `links`.
#' @param links A single string of comma-separated ids; each id may be wrapped
#'   in single or double quotes and padded with spaces.
#' @param lookup Data frame with `id` and `name` columns used to resolve the
#'   ids. Defaults to the `tab` object, as in the original version.
#' @return Character vector of "name,linked_name" strings, one per id found in
#'   `lookup` (in `lookup` row order); `character(0)` when none is found.
conns <- function(name, links, lookup = tab) {
  # Compare ids as trimmed strings. Converting with as.numeric() makes an id
  # such as "100000" match as "1e+05" against the character ids, so it is
  # never found.
  ids <- trimws(unlist(strsplit(gsub("['\"]", "", links), ",", fixed = TRUE)))
  linked <- lookup$name[as.character(lookup$id) %in% ids]
  if (length(linked) == 0L) {
    # paste() would otherwise turn the empty match into "name,"
    return(character(0))
  }
  paste(name, linked, sep = ",")
}
connections <- unname(unlist(mapply(conns, tab$name, tab$links,
SIMPLIFY=FALSE)))
The first step should be to normalize the data, in particular, parse the strings.
You can use ddply: it applies a function
that takes a chunk of a data.frame (a row, in our case)
and transforms it in some way. You just have to write a function
that works on one row, i.e., on one string.
# Sample data
n <- 10
k <- 3
ids <- as.character(unique(round(1e5*runif(n))))
n <- length(ids)
names <- LETTERS[1:n]
links <- lapply( ids, function(u)
sample(setdiff(ids,u),k,replace=FALSE) )
links <- sapply( links, function(u)
paste( '"', paste(u,collapse='","'), '"', sep="" ) )
d <- data.frame(
id=ids,
name=names,
links=links,
stringsAsFactors=FALSE
)
library(plyr)
library(stringr)
dd <- ddply(
d,
c("id", "name"),
function(u) data.frame(
id=u$id,
name=u$name,
link=unlist(str_split( str_replace_all( u$links, '"', '' ), "," ))
))
You can then join the data, either with merge or sqldf.
library(sqldf)
sqldf("
SELECT A.name, B.name
FROM dd AS A, d AS B
WHERE A.link = B.id
")
dat<- structure(list(
id = c("134235", "23459", "165432"),
name = c("dave", "mary", "jane"),
links = c("'34657', '34563', '23459'",
"'134235', '45868', '45677'",
"'134235', '23459', '44657'")),
.Names = c("id", "name", "links"),
row.names = c(NA, -3L), class = "data.frame")
# It can all be done in base, of course...
library(stringr)
library(reshape2)
# This would be easy to do if links weren't in that format -
# one record per id-link pair would be preferable.
# Split dat$links into one column per linked id, removing any quotes first
dat.wider <- data.frame(
  dat[, c("id", "name")],
  str_split_fixed(
    # `replacement` is spelled out in full; the abbreviated `replace` only
    # worked through partial argument matching.
    string = gsub(pattern = "['|\"]", replacement = "", x = dat$links),
    pattern = ", ",
    n = 3
  )
)
# Reshape
dat.long <- melt(dat.wider, id.var = c("id", "name"))
# Self-join - this is not quite the right method, but I'm just not
# thinking straight right now
dat.joined <- unique(merge(x = dat.long[ , c("name", "value")],
y = dat.long[ , c("id", "name")],
by.x = "value",
by.y = "id"
))
# And, finally, if you wanted vector output...
res <- with(dat.joined, paste(name.x, name.y, sep = ", "))

Resources