dataframe row to string with column names - r

My df
name age
tom 21
mary 42
How can I combine each row to something like
name:tom,age:21
name:mary,age:42
the output can be a list of strings.

A more general approach using apply.
apply(df1, 1, function(x) {n <- names(df1); paste0(n[1],":",x[1],",", n[2],":",x[2], collapse = "")})
here is a super general version:
df1<-
structure(list(name = c("tom", "mary"), age = c(21L, 42L), cool = c("yes",
"no")), row.names = c(NA, -2L), class = "data.frame")
apply(
apply(df1, 1, function(x) {n <- names(df1); paste0(paste(n,x, sep = ":"))}),
2,
paste0, collapse = ","
)
# "name:tom,age:21,cool:yes" "name:mary,age:42,cool:no"

Try with thispaste combination:
df$new.col <- paste(paste(colnames(df)[1], df$name, sep = ":"),
paste(colnames(df)[2], df$age, sep = ":"),
sep = ",")
# output
# name age new.col
#1 tom 21 name:tom,age:21
#2 mary 42 name:mary,age:42

I have some sample data, such as:
name=c("ali","asgar","ahmad","aslam","alvi")
age=c(12,33,23,16,34)
mydf=data.frame(name,age)
Data frame looking is as
> mydf
name age
1 ali 12
2 asgar 33
3 ahmad 23
4 aslam 16
5 alvi 34
Now make a list object and fill it.
mylist=list()
for(i in 1:nrow(mydf))
{
a=as.integer(mydf$age[i])
n=as.String(mydf$name[i])
mylist[i]=paste(paste(paste("name",n,sep = ":"),"age",sep = ","),a,sep = ":")
}
Finall, result is
> mylist
[[1]]
[1] "name:ali,age:12"
[[2]]
[1] "name:asgar,age:33"
[[3]]
[1] "name:ahmad,age:23"
[[4]]
[1] "name:aslam,age:16"
[[5]]
[1] "name:alvi,age:34"

Related

Many string replacements in R

How can I do a string replace for one column, but multiple conditions.
I have the following data
strings <- as_tibble(c("string.a","string.a", "string.b", "string.c"))
# A tibble: 4 x 1
value
<chr>
1 string_alice
2 string_alice
3 string_bob
4 string_joe
and the following replacements
replacements <- c("alice", "bob", "joe")
conditions <- c(".a", ".b", ".c")
The resulting data would be
result <- as_tibble(c("string_alice", "string_bob", "string_joe"))
# A tibble: 4 x 1
value
<chr>
1 string_alice
2 string_alice
3 string_bob
4 string_joe
I have considered a mapping table of some sort, but it is not clear to me how to feed a mapping table to a string replace function.
nm = setNames(replacements, gsub("\\.", "", conditions))
sapply(strsplit(strings$value, "\\."), function(x){
paste(c(x[1], nm[x[2]]), collapse = ".")
})
Data
strings = structure(list(value = c("string.a", "string.a", "string.b",
"string.c")), row.names = c(NA, -4L), class = c("tbl_df", "tbl",
"data.frame"))
We can use gsubfn
library(gsubfn)
sub("\\.", "_", gsubfn("(\\w+)$", setNames(as.list(replacements),
sub("\\.", "", conditions)), strings$value))
#[1] "string_alice" "string_alice" "string_bob" "string_joe"

Extract subset of string in dataframe column

I have one of the columns in the data frame as follows. Need to get the output as shown.
Data :
NM_001104633|0|Sema3d|-
NM_0011042|0|XYZ|-
NM_0956|0|ghd|+
Required output :
Sema3d
XYZ
ghd
x = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
sub(".*0\\|(.*)\\|[+|-]", "\\1", x)
#[1] "Sema3d" "XYZ" "ghd"
#OR
sapply(strsplit(x, "\\|"), function(s) s[3])
#[1] "Sema3d" "XYZ" "ghd"
#OR
sapply(x, function(s){
inds = gregexpr("\\|", s)[[1]]
substring(s, inds[2] + 1, inds[3] - 1)
},
USE.NAMES = FALSE)
#[1] "Sema3d" "XYZ" "ghd"
We can use read.table to separate them in different columns and then select only the one which we are interested in.
read.table(text = df$V1, sep = "|")
# V1 V2 V3 V4
#1 NM_001104633 0 Sema3d -
#2 NM_0011042 0 XYZ -
#3 NM_0956 0 ghd +
We can also use tidyr::separate for this
tidyr::separate(df, V1, into = paste0("col1", 1:4), sep = "\\|")
Or cSplit from splitstackshape
splitstackshape::cSplit(df, "V1", sep = "|")
data
df <- structure(list(V1 = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-",
"NM_0956|0|ghd|+")), class = "data.frame", row.names = c(NA, -3L))
The following regex takes all text between the last pair of | followed by a + or a -.
([^\|]*)(?=\|(\+|-))
Demo
We can use sub from base R
sub(".*\\|(\\w+)\\|[-+]$", "\\1", x)
#[1] "Sema3d" "XYZ" "ghd"
Or using gsub
gsub(".*\\d+\\||\\|.*", "", x)
#[1] "Sema3d" "XYZ" "ghd"
data
x <- c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
The package unglue offers a readable alternative, if not as efficient :
x = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
unglue::unglue_vec(x, "{drop1}|0|{keep}|{drop2}",var = "keep")
#> [1] "Sema3d" "XYZ" "ghd"
# or
unglue::unglue_vec(x, "{=.*?}|0|{keep}|{=.*?}")
#> [1] "Sema3d" "XYZ" "ghd"
Or in the data frame directly :
df <- data.frame(col = x)
unglue::unglue_unnest(df, col, "{=.*?}|0|{new_col}|{=.*?}")
#> new_col
#> 1 Sema3d
#> 2 XYZ
#> 3 ghd

Parse unexpected symbol error in function applied over list

I'm trying to check the "pin" numbers of cases with missing data for each variable of interest in my dataset.
Here are some fake data:
c <- data.frame(pin = c(1, 2, 3, 4), type = c(1, 1, 2, 2), v1 = c(1, NA, NA,
NA), v2 = c(NA, NA, 1, 1))
I wrote a function "m.pin" to do this:
m.pin <- function(x, data = "c", return = "$pin") {
sect <- gsub("^.*\\[", "\\[", deparse(substitute(x)))
vect <- eval(parse(text = paste(data, return, sect, sep = "")))
return(vect[is.na(x)])
}
And I use it like so:
m.pin(c$v1[c$type == 1])
[1] 2
I wrote a function to apply "m.pin" over a list of variables to only return pins with missing data:
return.m.pin <- function(x, fun = m.pin) {
val.list <- lapply(x, fun)
condition <- lapply(val.list, function(x) length(x) > 0)
val.list[unlist(condition)]
}
But when I apply it, I get this error:
l <- lst(c$v1[c$type == 1], c$v2[c$type == 2])
return.m.pin(l)
Error in parse(text = paste(data, return, sect, sep = "")) :
<text>:1:9: unexpected ']'
1: c$pin[i]]
^
How can I rewrite my function(s) to avoid this issue?
Many thanks!
Please see Gregor's comment for the most critical issues with your code (to add: don't use return as a variable name as it is the name of a base R function).
It's not clear to me why you want to define a specific function m.pin, nor what you ultimately are trying to do, but I am assuming this is a critical design component.
Rewriting m.pin as
m.pin <- function(df, type, vcol) which(df[, "type"] == type & is.na(df[, vcol]))
we get
m.pin(df, 1, "v1")
#[1] 2
Or to identify rows with NA in "v1" for all types
lapply(unique(df$type), function(x) m.pin(df, x, "v1"))
#[[1]]
#[1] 2
#
#[[2]]
#[1] 3 4
Update
In response to Gregor's comment, perhaps this is what you're after?
by(df, df$type, function(x)
list(v1 = x$pin[which(is.na(x$v1))], v2 = x$pin[which(is.na(x$v2))]))
# df$type: 1
# $v1
# [1] 2
#
# $v2
# [1] 1 2
#
# ------------------------------------------------------------
# df$type: 2
# $v1
# [1] 3 4
#
# $v2
# integer(0)
This returns a list of the pin numbers for every type and NA entries in v1/v2.
Sample data
df <- data.frame(
pin = c(1, 2, 3, 4),
type = c(1, 1, 2, 2),
v1 = c(1, NA, NA, NA),
v2 = c(NA, NA, 1, 1))
I would suggest rewriting like this (if this approach is to be taken at all). I call your data d because c is already the name of an extremely common function.
# string column names, pass in the data frame as an object
# means no need for eval, parse, substitute, etc.
foo = function(data, na_col, return_col = "pin", filter_col, filter_val) {
if(! missing(filter_col) & ! missing(filter_val)) {
data = data[data[, filter_col] == filter_val, ]
}
data[is.na(data[, na_col]), return_col]
}
# working on the whole data frame
foo(d, na_col = "v1", return_col = "pin")
# [1] 2 3 4
# passing in a subset of the data
foo(d[d$type == 1, ], "v1", "pin")
# [1] 2
# using function arguments to subset the data
foo(d, "v1", "pin", filter_col = "type", filter_val = 1)
# [1] 2
# calling it with changing arguments:
# you could use `Map` or `mapply` to be fancy, but this for loop is nice and clear
inputs = data.frame(na_col = c("v1", "v2"), filter_val = c(1, 2), stringsAsFactors = FALSE)
result = list()
for (i in 1:nrow(inputs)) {
result[[i]] = foo(d, na_col = inputs$na_col[i], return_col = "pin",
filter_col = "type", filter_val = inputs$filter_val[i])
}
result
# [[1]]
# [1] 2
#
# [[2]]
# numeric(0)
A different approach I would suggest is melting your data into a long format, and simply taking a subset of the NA values, hence getting all combinations of type and the v* columns that have NA values at once. Do this once, and no function is needed to look up individual combinations.
d_long = reshape2::melt(d, id.vars = c("pin", "type"))
library(dplyr)
d_long %>% filter(is.na(value)) %>%
arrange(variable, type)
# pin type variable value
# 1 2 1 v1 NA
# 2 3 2 v1 NA
# 3 4 2 v1 NA
# 4 1 1 v2 NA
# 5 2 1 v2 NA

Display a list of matrices without row and column names

I want to display a list of matrices (not a single matrix, as has been asked about elsewhere) without the small [1,] and [,1] row and column indicators.
For example, given myList:
myList <- list(matrix(c(1,2,3,4,5,6), nrow = 2), matrix(c(1,2,3,4,5,6), nrow = 3))
names(myList) <- c("This is the first matrix:", "This is the second matrix:")
I'm looking for some function myFunction() that will output:
> myFunction(myList)
$`This is the first matrix:`
1 3 5
2 4 6
$`This is the second matrix:`
1 4
2 5
3 6
It would be even better if it could eliminate the $... around the list names so that it would display:
This is the first matrix:
1 3 5
2 4 6
This is the second matrix:
1 4
2 5
3 6
After reading all the related questions, I've tried
myList %>% lapply(print, row.names = F)
myList %>% lapply(prmatrix, collab = NULL, rowlab = NULL)
myList %>% lapply(write.table, sep = " ", row.names = F, col.names = F)
But none work as intended.
So you are just missing the headers? How about something like
library(purrr) #for walk2()
print_with_name <- function(mat, name) {
cat(name,"\n")
write.table(mat, sep = " ", row.names = F, col.names = F)
}
myList %>% walk2(., names(.), print_with_name)

aggregate values in dataframe by partly matching rownames in R

I'm thumbling around with the following problem, but to no evail:
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
value
abc 1
abcd 2
ef 3
gh 4
l <- nrow(d)
wordmat <- matrix(rep(NA, l^2), l, l, dimnames = list(row.names(d), row.names(d)))
for (i in 1:ncol(wordmat)) {
rid <- agrep(colnames(wordmat)[i], rownames(wordmat), max = 0)
d$matchid[i] <- paste(rid, collapse = ";")
}
# desired output:
(d_agg <- data.frame(value = c(3, 3, 4), row.names = c("abc;abcd", "ef", "gh")))
value
abc;abcd 3
ef 3
gh 4
is there a function for this?
Here's a possible solution that you might be able to modify to suit your needs.
Some notes:
I couldn't figure out how to deal with rownames() directly, particularly in the last stage, so this depends on you being happy with copying your row names as a new variable.
The function below "hard-codes" the variable names, functions, and so on. That is to say, it is not by any means a generalized function, but one which might be useful as you look further into this problem.
Here's the function.
matches <- function(data, ...) {
temp = vector("list", nrow(data))
for (i in 1:nrow(data)) {
temp1 = agrep(data$RowNames[i], data$RowNames, value = TRUE, ...)
temp[[i]] = data.frame(RowNames = paste(temp1, collapse = "; "),
value = sum(data[temp1, "value"]))
}
temp = do.call(rbind, temp)
temp[!duplicated(temp$RowNames), ]
}
Note that the function needs a column called RowNames, so we'll create that, and then test the function.
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
d$RowNames <- rownames(d)
matches(d)
# RowNames value
# 1 abc; abcd 3
# 3 ef 3
# 4 gh 4
matches(d, max.distance = 2)
# RowNames value
# 1 abc; abcd 3
# 3 abc; abcd; ef; gh 10
matches(d, max.distance = 4)
# RowNames value
# 1 abc; abcd; ef; gh 10
This works for your example but may need tweaking for the real thing:
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
rowclust <- hclust(as.dist(adist(rownames(d))), method="single")
rowgroups <- cutree(rowclust, h=1.5)
rowagg <- aggregate(d, list(rowgroups), sum)
rowname <- unclass(by(rownames(d), rowgroups, paste, collapse=";"))
rownames(rowagg) <- rowname
rowagg
Group.1 value
abc;abcd 1 3
ef 2 3
gh 3 4

Resources