My df
name age
tom 21
mary 42
How can I combine each row to something like
name:tom,age:21
name:mary,age:42
the output can be a list of strings.
A more general approach using apply.
apply(df1, 1, function(x) {n <- names(df1); paste0(n[1],":",x[1],",", n[2],":",x[2], collapse = "")})
here is a super general version:
df1<-
structure(list(name = c("tom", "mary"), age = c(21L, 42L), cool = c("yes",
"no")), row.names = c(NA, -2L), class = "data.frame")
apply(
apply(df1, 1, function(x) {n <- names(df1); paste0(paste(n,x, sep = ":"))}),
2,
paste0, collapse = ","
)
# "name:tom,age:21,cool:yes" "name:mary,age:42,cool:no"
Try with thispaste combination:
df$new.col <- paste(paste(colnames(df)[1], df$name, sep = ":"),
paste(colnames(df)[2], df$age, sep = ":"),
sep = ",")
# output
# name age new.col
#1 tom 21 name:tom,age:21
#2 mary 42 name:mary,age:42
I have some sample data, such as:
name=c("ali","asgar","ahmad","aslam","alvi")
age=c(12,33,23,16,34)
mydf=data.frame(name,age)
Data frame looking is as
> mydf
name age
1 ali 12
2 asgar 33
3 ahmad 23
4 aslam 16
5 alvi 34
Now make a list object and fill it.
mylist=list()
for(i in 1:nrow(mydf))
{
a=as.integer(mydf$age[i])
n=as.String(mydf$name[i])
mylist[i]=paste(paste(paste("name",n,sep = ":"),"age",sep = ","),a,sep = ":")
}
Finall, result is
> mylist
[[1]]
[1] "name:ali,age:12"
[[2]]
[1] "name:asgar,age:33"
[[3]]
[1] "name:ahmad,age:23"
[[4]]
[1] "name:aslam,age:16"
[[5]]
[1] "name:alvi,age:34"
Related
How can I do a string replace for one column, but multiple conditions.
I have the following data
strings <- as_tibble(c("string.a","string.a", "string.b", "string.c"))
# A tibble: 4 x 1
value
<chr>
1 string_alice
2 string_alice
3 string_bob
4 string_joe
and the following replacements
replacements <- c("alice", "bob", "joe")
conditions <- c(".a", ".b", ".c")
The resulting data would be
result <- as_tibble(c("string_alice", "string_bob", "string_joe"))
# A tibble: 4 x 1
value
<chr>
1 string_alice
2 string_alice
3 string_bob
4 string_joe
I have considered a mapping table of some sort, but it is not clear to me how to feed a mapping table to a string replace function.
nm = setNames(replacements, gsub("\\.", "", conditions))
sapply(strsplit(strings$value, "\\."), function(x){
paste(c(x[1], nm[x[2]]), collapse = ".")
})
Data
strings = structure(list(value = c("string.a", "string.a", "string.b",
"string.c")), row.names = c(NA, -4L), class = c("tbl_df", "tbl",
"data.frame"))
We can use gsubfn
library(gsubfn)
sub("\\.", "_", gsubfn("(\\w+)$", setNames(as.list(replacements),
sub("\\.", "", conditions)), strings$value))
#[1] "string_alice" "string_alice" "string_bob" "string_joe"
I have one of the columns in the data frame as follows. Need to get the output as shown.
Data :
NM_001104633|0|Sema3d|-
NM_0011042|0|XYZ|-
NM_0956|0|ghd|+
Required output :
Sema3d
XYZ
ghd
x = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
sub(".*0\\|(.*)\\|[+|-]", "\\1", x)
#[1] "Sema3d" "XYZ" "ghd"
#OR
sapply(strsplit(x, "\\|"), function(s) s[3])
#[1] "Sema3d" "XYZ" "ghd"
#OR
sapply(x, function(s){
inds = gregexpr("\\|", s)[[1]]
substring(s, inds[2] + 1, inds[3] - 1)
},
USE.NAMES = FALSE)
#[1] "Sema3d" "XYZ" "ghd"
We can use read.table to separate them in different columns and then select only the one which we are interested in.
read.table(text = df$V1, sep = "|")
# V1 V2 V3 V4
#1 NM_001104633 0 Sema3d -
#2 NM_0011042 0 XYZ -
#3 NM_0956 0 ghd +
We can also use tidyr::separate for this
tidyr::separate(df, V1, into = paste0("col1", 1:4), sep = "\\|")
Or cSplit from splitstackshape
splitstackshape::cSplit(df, "V1", sep = "|")
data
df <- structure(list(V1 = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-",
"NM_0956|0|ghd|+")), class = "data.frame", row.names = c(NA, -3L))
The following regex takes all text between the last pair of | followed by a + or a -.
([^\|]*)(?=\|(\+|-))
Demo
We can use sub from base R
sub(".*\\|(\\w+)\\|[-+]$", "\\1", x)
#[1] "Sema3d" "XYZ" "ghd"
Or using gsub
gsub(".*\\d+\\||\\|.*", "", x)
#[1] "Sema3d" "XYZ" "ghd"
data
x <- c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
The package unglue offers a readable alternative, if not as efficient :
x = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
unglue::unglue_vec(x, "{drop1}|0|{keep}|{drop2}",var = "keep")
#> [1] "Sema3d" "XYZ" "ghd"
# or
unglue::unglue_vec(x, "{=.*?}|0|{keep}|{=.*?}")
#> [1] "Sema3d" "XYZ" "ghd"
Or in the data frame directly :
df <- data.frame(col = x)
unglue::unglue_unnest(df, col, "{=.*?}|0|{new_col}|{=.*?}")
#> new_col
#> 1 Sema3d
#> 2 XYZ
#> 3 ghd
I'm trying to check the "pin" numbers of cases with missing data for each variable of interest in my dataset.
Here are some fake data:
c <- data.frame(pin = c(1, 2, 3, 4), type = c(1, 1, 2, 2), v1 = c(1, NA, NA,
NA), v2 = c(NA, NA, 1, 1))
I wrote a function "m.pin" to do this:
m.pin <- function(x, data = "c", return = "$pin") {
sect <- gsub("^.*\\[", "\\[", deparse(substitute(x)))
vect <- eval(parse(text = paste(data, return, sect, sep = "")))
return(vect[is.na(x)])
}
And I use it like so:
m.pin(c$v1[c$type == 1])
[1] 2
I wrote a function to apply "m.pin" over a list of variables to only return pins with missing data:
return.m.pin <- function(x, fun = m.pin) {
val.list <- lapply(x, fun)
condition <- lapply(val.list, function(x) length(x) > 0)
val.list[unlist(condition)]
}
But when I apply it, I get this error:
l <- lst(c$v1[c$type == 1], c$v2[c$type == 2])
return.m.pin(l)
Error in parse(text = paste(data, return, sect, sep = "")) :
<text>:1:9: unexpected ']'
1: c$pin[i]]
^
How can I rewrite my function(s) to avoid this issue?
Many thanks!
Please see Gregor's comment for the most critical issues with your code (to add: don't use return as a variable name as it is the name of a base R function).
It's not clear to me why you want to define a specific function m.pin, nor what you ultimately are trying to do, but I am assuming this is a critical design component.
Rewriting m.pin as
m.pin <- function(df, type, vcol) which(df[, "type"] == type & is.na(df[, vcol]))
we get
m.pin(df, 1, "v1")
#[1] 2
Or to identify rows with NA in "v1" for all types
lapply(unique(df$type), function(x) m.pin(df, x, "v1"))
#[[1]]
#[1] 2
#
#[[2]]
#[1] 3 4
Update
In response to Gregor's comment, perhaps this is what you're after?
by(df, df$type, function(x)
list(v1 = x$pin[which(is.na(x$v1))], v2 = x$pin[which(is.na(x$v2))]))
# df$type: 1
# $v1
# [1] 2
#
# $v2
# [1] 1 2
#
# ------------------------------------------------------------
# df$type: 2
# $v1
# [1] 3 4
#
# $v2
# integer(0)
This returns a list of the pin numbers for every type and NA entries in v1/v2.
Sample data
df <- data.frame(
pin = c(1, 2, 3, 4),
type = c(1, 1, 2, 2),
v1 = c(1, NA, NA, NA),
v2 = c(NA, NA, 1, 1))
I would suggest rewriting like this (if this approach is to be taken at all). I call your data d because c is already the name of an extremely common function.
# string column names, pass in the data frame as an object
# means no need for eval, parse, substitute, etc.
foo = function(data, na_col, return_col = "pin", filter_col, filter_val) {
if(! missing(filter_col) & ! missing(filter_val)) {
data = data[data[, filter_col] == filter_val, ]
}
data[is.na(data[, na_col]), return_col]
}
# working on the whole data frame
foo(d, na_col = "v1", return_col = "pin")
# [1] 2 3 4
# passing in a subset of the data
foo(d[d$type == 1, ], "v1", "pin")
# [1] 2
# using function arguments to subset the data
foo(d, "v1", "pin", filter_col = "type", filter_val = 1)
# [1] 2
# calling it with changing arguments:
# you could use `Map` or `mapply` to be fancy, but this for loop is nice and clear
inputs = data.frame(na_col = c("v1", "v2"), filter_val = c(1, 2), stringsAsFactors = FALSE)
result = list()
for (i in 1:nrow(inputs)) {
result[[i]] = foo(d, na_col = inputs$na_col[i], return_col = "pin",
filter_col = "type", filter_val = inputs$filter_val[i])
}
result
# [[1]]
# [1] 2
#
# [[2]]
# numeric(0)
A different approach I would suggest is melting your data into a long format, and simply taking a subset of the NA values, hence getting all combinations of type and the v* columns that have NA values at once. Do this once, and no function is needed to look up individual combinations.
d_long = reshape2::melt(d, id.vars = c("pin", "type"))
library(dplyr)
d_long %>% filter(is.na(value)) %>%
arrange(variable, type)
# pin type variable value
# 1 2 1 v1 NA
# 2 3 2 v1 NA
# 3 4 2 v1 NA
# 4 1 1 v2 NA
# 5 2 1 v2 NA
I want to display a list of matrices (not a single matrix, as has been asked about elsewhere) without the small [1,] and [,1] row and column indicators.
For example, given myList:
myList <- list(matrix(c(1,2,3,4,5,6), nrow = 2), matrix(c(1,2,3,4,5,6), nrow = 3))
names(myList) <- c("This is the first matrix:", "This is the second matrix:")
I'm looking for some function myFunction() that will output:
> myFunction(myList)
$`This is the first matrix:`
1 3 5
2 4 6
$`This is the second matrix:`
1 4
2 5
3 6
It would be even better if it could eliminate the $... around the list names so that it would display:
This is the first matrix:
1 3 5
2 4 6
This is the second matrix:
1 4
2 5
3 6
After reading all the related questions, I've tried
myList %>% lapply(print, row.names = F)
myList %>% lapply(prmatrix, collab = NULL, rowlab = NULL)
myList %>% lapply(write.table, sep = " ", row.names = F, col.names = F)
But none work as intended.
So you are just missing the headers? How about something like
library(purrr) #for walk2()
print_with_name <- function(mat, name) {
cat(name,"\n")
write.table(mat, sep = " ", row.names = F, col.names = F)
}
myList %>% walk2(., names(.), print_with_name)
I'm thumbling around with the following problem, but to no evail:
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
value
abc 1
abcd 2
ef 3
gh 4
l <- nrow(d)
wordmat <- matrix(rep(NA, l^2), l, l, dimnames = list(row.names(d), row.names(d)))
for (i in 1:ncol(wordmat)) {
rid <- agrep(colnames(wordmat)[i], rownames(wordmat), max = 0)
d$matchid[i] <- paste(rid, collapse = ";")
}
# desired output:
(d_agg <- data.frame(value = c(3, 3, 4), row.names = c("abc;abcd", "ef", "gh")))
value
abc;abcd 3
ef 3
gh 4
is there a function for this?
Here's a possible solution that you might be able to modify to suit your needs.
Some notes:
I couldn't figure out how to deal with rownames() directly, particularly in the last stage, so this depends on you being happy with copying your row names as a new variable.
The function below "hard-codes" the variable names, functions, and so on. That is to say, it is not by any means a generalized function, but one which might be useful as you look further into this problem.
Here's the function.
matches <- function(data, ...) {
temp = vector("list", nrow(data))
for (i in 1:nrow(data)) {
temp1 = agrep(data$RowNames[i], data$RowNames, value = TRUE, ...)
temp[[i]] = data.frame(RowNames = paste(temp1, collapse = "; "),
value = sum(data[temp1, "value"]))
}
temp = do.call(rbind, temp)
temp[!duplicated(temp$RowNames), ]
}
Note that the function needs a column called RowNames, so we'll create that, and then test the function.
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
d$RowNames <- rownames(d)
matches(d)
# RowNames value
# 1 abc; abcd 3
# 3 ef 3
# 4 gh 4
matches(d, max.distance = 2)
# RowNames value
# 1 abc; abcd 3
# 3 abc; abcd; ef; gh 10
matches(d, max.distance = 4)
# RowNames value
# 1 abc; abcd; ef; gh 10
This works for your example but may need tweaking for the real thing:
d <- data.frame(value = 1:4, row.names = c("abc", "abcd", "ef", "gh"))
rowclust <- hclust(as.dist(adist(rownames(d))), method="single")
rowgroups <- cutree(rowclust, h=1.5)
rowagg <- aggregate(d, list(rowgroups), sum)
rowname <- unclass(by(rownames(d), rowgroups, paste, collapse=";"))
rownames(rowagg) <- rowname
rowagg
Group.1 value
abc;abcd 1 3
ef 2 3
gh 3 4