Converting a list into a data frame with specific titles - r

I have a list like this:
$20
[1] 500
$30
[2] 600
I want to convert this into a dataframe like this
id values
20 500
30 600

You can do:
L <- list(`20`=500,`30`=600)
df <- data.frame(id=names(L), values=sapply(L, function(x) x[1]))
# > df
# id values
# 20 20 500
# 30 30 600
or a bit more tricky:
df <- data.frame(id=names(L), values=sapply(L, '[', 1))
Till now I was thinking about longer vectors (and take only the first element). But in your case (if each element of the list is only a 1-element vector) a shorter solution (thx to Abdou for the comment) is:
df <- data.frame(id = names(L), values = unlist(L))

You can use do.call to solve your problem:
li <- list(`20`=500,`30`=600)
df <- data.frame(Values = do.call("rbind",li))
df$Id <- rownames(df)
rownames(df) <- NULL
df <- df[,c(2,1)]
df
Output:
> df
Id Values
1 20 500
2 30 600

purrr's *_df functions iterate a function over a list and simplify to a data.frame. With the development version, you can use the new imap variant that uses the names or indices as a second variable .y:
library(purrr)
l <- list(`25` = 900, `26` = 500)
l %>% imap_dfr(~data.frame(id = as.integer(.y),
value = .x))
#> id value
#> 1 25 900
#> 2 26 500
or with CRAN purrr, you can pass the names as the second variable to map2:
l %>% map2_df(names(.),
~data.frame(id = as.integer(.y),
value = .x))
#> id value
#> 1 25 900
#> 2 26 500

use unlist function.
L = list(`20`=500,`30`=600)
df = unlist(L)
It returns vector. If you want data.frame:
df = as.data.frame(t(unlist(L)))
Output:
> df
20 30
1 500 600

Here's a solution with Map
l <- list(`20`=500,`30`=600)
do.call(rbind,Map(data.frame,id=names(l),values=l))
id values
20 20 500
30 30 600

Easy way to achieve the same by the use of melt function from reshape2 package.
library(reshape2)
l = list('20'=500, '30'=600)
melt(as.data.frame(l, check.names = F))
Output:
variable value
1 20 500
2 30 600
Alternate approach without using any package
ls = list('20' = 500, '30' = 600, '40' = 400)
d = data.frame('id' = row.names(as.array(unlist(ls))), 'value' = unlist(ls),row.names = 1:length(ls))
Output
id value
1 20 500
2 30 600
3 40 700
4 50 800

Related

Selecting number from string based on criteria

I have the following data set:
PATH = c("5-8-10-8-17-20",
"56-85-89-89-0-15-88-10",
"58-85-89-65-49-51")
INDX = c(18, 89, 50)
data.frame(PATH, INDX)
PATH
INDX
5-8-10-8-17-20
18
56-85-89-89-0-15-88-10
89
58-85-89-65-49-51
50
The column PATH has strings that represent a numerical series and I want to be able to pick the largest number from the string that satisfies PATH <= INDX, that is selecting a number from PATH that is equal to INDX or the largest number from PATH that is yet less than INDX
my desired output would look like this:
PATH
INDX
PICK
5-8-10-8-17-20
18
17
56-85-89-89-0-15-88-10
89
88
58-85-89-65-49-51
50
49
Some of my thought-process behind the answer:
I know that If I have a function such strsplit I could separate each string by "-", arrange by number and then subtract with INDX and thus select the smallest negative number or zero. However, the original dataset is quite large and I wonder if there is a faster or more efficient way to perform this task.
Another option:
mapply(
\(x, y) max(x[x <= y]),
strsplit(PATH, "-") |> lapply(as.integer),
INDX
)
# [1] 17 88 49
Using purrr::map2_dbl():
library(purrr)
PICK <- map2_dbl(
strsplit(PATH, "-"),
INDX,
~ max(
as.numeric(.x)[as.numeric(.x) <= .y]
)
)
# 17 89 49
The below should be reasonably efficient, there is nothing wrong with your approach.
numpath <- sapply(strsplit(PATH, "-"), as.numeric)
maxindexes <- lapply(1:length(numpath), function(x) which(numpath[[x]] <= INDX[x]))
result <- sapply(1:length(numpath), function(x) max(numpath[[x]][maxindexes[[x]]]))
> result
[1] 17 89 49
Using dplyr
library(dplyr)
df |>
rowwise() |>
mutate(across(PATH, ~ {
a = unlist(strsplit(.x, split = "-"))
max(as.numeric(a)[which(as.numeric(a) <= INDX)])
}, .names = "PICK"))
PATH INDX PICK
<chr> <dbl> <dbl>
1 5-8-10-8-17-20 18 17
2 56-85-89-89-0-15-88-10 89 89
3 58-85-89-65-49-51 50 49
You can create a custom function like below:
my_func <- function(vec1, vec2) {
sort(as.numeric(unlist(strsplit(vec1, split = "-")))) -> x
return(x[max(cumsum(x <= vec2))])
}
df$PICK <- sapply(seq_len(nrow(df)), function(i) my_func(df$PATH[i], df$INDX[i]))
which will yield the following output:
# PATH INDX PICK
# 1 5-8-10-8-17-20 18 17
# 2 56-85-89-89-0-15-88-10 89 89
# 3 58-85-89-65-49-51 50 49

How to convert data in R

I have CSV data, for example, A+12, A+13, A+14 (those are chr)
what kind of function I should use when I change those data to just numbers?
like A+12 -> 12
A+13 -> 13
A+14 -> 14
You can use parse_number from readr package:
> library(readr)
> dat <- data.frame(col1 = c('A+12','A+13','A+14'), stringsAsFactors = F)
> dat
col1
1 A+12
2 A+13
3 A+14
> dat$number <- parse_number(dat$col1)
> dat
col1 number
1 A+12 12
2 A+13 13
3 A+14 14
Using base R
> gsub('(.*)(\\d\\d+$)','\\2', dat$col1)
[1] "12" "13" "14"
>
Adding your scenario:
> dat <- data.frame(col1 = c('A+12', 'B+51', 'A+36', 'B+55', 'B+3' ,'A+31'), stringsAsFactors = F)
> dat
col1
1 A+12
2 B+51
3 A+36
4 B+55
5 B+3
6 A+31
> dat$number <- parse_number(dat$col1)
> dat %>% mutate(number = case_when(substr(col1, 1,1 ) == 'A' ~ number,
+ substr(col1, 1,1 ) == 'B' ~ number * -1))
col1 number
1 A+12 12
2 B+51 -51
3 A+36 36
4 B+55 -55
5 B+3 -3
6 A+31 31
>
substr(x,a,b) takes any x object with strings, and "cut" them on their a'th and b'th letter positions, so you can do substr("A+12",3,4) to get only "12", and then turn it to numeric with as.numeric():
as.numeric(substr("A+12",3,4))
Edit1: As the number of letter each string varies, we'll need to change the code. say your data frame is called df, with the column you described is old:
new = numeric() # Create an empty vector
for(i in 1:nrow(df)){
new[i] = as.numeric(substr(df$old[i], 3, nchar(df$old[i])))}
# Get from the 3rd to the last letter of string
df$new = c(new[1:100], -new[101:200])
OBS: if you wanted to make this code more general, you could do:
df$new = c(new[grep("A", df$old)], -new[grep("B", df$old)])
As grep returns the indexes of where the patter "A"/"B" was matched in df$old.
Edit2: The nice function that Karthik S showed can simplify a lot the code:
new = readr::parse_number(df$old)
df$new = c(new[1:100], -new[101:200])
Or
df$new = c(new[grep("A", df$old)], -new[grep("B", df$old)])

How to count unique values from vector? [duplicate]

Let's say I have:
v = rep(c(1,2, 2, 2), 25)
Now, I want to count the number of times each unique value appears. unique(v) returns what the unique values are, but not how many they are.
> unique(v)
[1] 1 2
I want something that gives me
length(v[v==1])
[1] 25
length(v[v==2])
[1] 75
but as a more general one-liner :) Something close (but not quite) like this:
#<doesn't work right> length(v[v==unique(v)])
Perhaps table is what you are after?
dummyData = rep(c(1,2, 2, 2), 25)
table(dummyData)
# dummyData
# 1 2
# 25 75
## or another presentation of the same data
as.data.frame(table(dummyData))
# dummyData Freq
# 1 1 25
# 2 2 75
If you have multiple factors (= a multi-dimensional data frame), you can use the dplyr package to count unique values in each combination of factors:
library("dplyr")
data %>% group_by(factor1, factor2) %>% summarize(count=n())
It uses the pipe operator %>% to chain method calls on the data frame data.
It is a one-line approach by using aggregate.
> aggregate(data.frame(count = v), list(value = v), length)
value count
1 1 25
2 2 75
length(unique(df$col)) is the most simple way I can see.
table() function is a good way to go, as Chase suggested.
If you are analyzing a large dataset, an alternative way is to use .N function in datatable package.
Make sure you installed the data table package by
install.packages("data.table")
Code:
# Import the data.table package
library(data.table)
# Generate a data table object, which draws a number 10^7 times
# from 1 to 10 with replacement
DT<-data.table(x=sample(1:10,1E7,TRUE))
# Count Frequency of each factor level
DT[,.N,by=x]
To get an un-dimensioned integer vector that contains the count of unique values, use c().
dummyData = rep(c(1, 2, 2, 2), 25) # Chase's reproducible data
c(table(dummyData)) # get un-dimensioned integer vector
1 2
25 75
str(c(table(dummyData)) ) # confirm structure
Named int [1:2] 25 75
- attr(*, "names")= chr [1:2] "1" "2"
This may be useful if you need to feed the counts of unique values into another function, and is shorter and more idiomatic than the t(as.data.frame(table(dummyData))[,2] posted in a comment to Chase's answer. Thanks to Ricardo Saporta who pointed this out to me here.
This works for me. Take your vector v
length(summary(as.factor(v),maxsum=50000))
Comment: set maxsum to be large enough to capture the number of unique values
or with the magrittr package
v %>% as.factor %>% summary(maxsum=50000) %>% length
Also making the values categorical and calling summary() would work.
> v = rep(as.factor(c(1,2, 2, 2)), 25)
> summary(v)
1 2
25 75
You can try also a tidyverse
library(tidyverse)
dummyData %>%
as.tibble() %>%
count(value)
# A tibble: 2 x 2
value n
<dbl> <int>
1 1 25
2 2 75
If you need to have the number of unique values as an additional column in the data frame containing your values (a column which may represent sample size for example), plyr provides a neat way:
data_frame <- data.frame(v = rep(c(1,2, 2, 2), 25))
library("plyr")
data_frame <- ddply(data_frame, .(v), transform, n = length(v))
You can also try dplyr::count
df <- tibble(x=c('a','b','b','c','c','d'), y=1:6)
dplyr::count(df, x, sort = TRUE)
# A tibble: 4 x 2
x n
<chr> <int>
1 b 2
2 c 2
3 a 1
4 d 1
If you want to run unique on a data.frame (e.g., train.data), and also get the counts (which can be used as the weight in classifiers), you can do the following:
unique.count = function(train.data, all.numeric=FALSE) {
# first convert each row in the data.frame to a string
train.data.str = apply(train.data, 1, function(x) paste(x, collapse=','))
# use table to index and count the strings
train.data.str.t = table(train.data.str)
# get the unique data string from the row.names
train.data.str.uniq = row.names(train.data.str.t)
weight = as.numeric(train.data.str.t)
# convert the unique data string to data.frame
if (all.numeric) {
train.data.uniq = as.data.frame(t(apply(cbind(train.data.str.uniq), 1,
function(x) as.numeric(unlist(strsplit(x, split=","))))))
} else {
train.data.uniq = as.data.frame(t(apply(cbind(train.data.str.uniq), 1,
function(x) unlist(strsplit(x, split=",")))))
}
names(train.data.uniq) = names(train.data)
list(data=train.data.uniq, weight=weight)
}
I know there are many other answers, but here is another way to do it using the sort and rle functions. The function rle stands for Run Length Encoding. It can be used for counts of runs of numbers (see the R man docs on rle), but can also be applied here.
test.data = rep(c(1, 2, 2, 2), 25)
rle(sort(test.data))
## Run Length Encoding
## lengths: int [1:2] 25 75
## values : num [1:2] 1 2
If you capture the result, you can access the lengths and values as follows:
## rle returns a list with two items.
result.counts <- rle(sort(test.data))
result.counts$lengths
## [1] 25 75
result.counts$values
## [1] 1 2
count_unique_words <-function(wlist) {
ucountlist = list()
unamelist = c()
for (i in wlist)
{
if (is.element(i, unamelist))
ucountlist[[i]] <- ucountlist[[i]] +1
else
{
listlen <- length(ucountlist)
ucountlist[[i]] <- 1
unamelist <- c(unamelist, i)
}
}
ucountlist
}
expt_counts <- count_unique_words(population)
for(i in names(expt_counts))
cat(i, expt_counts[[i]], "\n")

Find top deciles from dataframe by group

I am attempting to create new variables using a function and lapply rather than working right in the data with loops. I used to use Stata and would have solved this problem with a method similar to that discussed here.
Since naming variables programmatically is so difficult or at least awkward in R (and it seems you can't use indexing with assign), I have left the naming process until after the lapply. I am then using a for loop to do the renaming prior to merging and again for the merging. Are there more efficient ways of doing this? How would I replace the loops? Should I be doing some sort of reshaping?
#Reproducible data
data <- data.frame("custID" = c(1:10, 1:20),
"v1" = rep(c("A", "B"), c(10,20)),
"v2" = c(30:21, 20:19, 1:3, 20:6), stringsAsFactors = TRUE)
#Function to analyze customer distribution for each category (v1)
pf <- function(cat, df) {
df <- df[df$v1 == cat,]
df <- df[order(-df$v2),]
#Divide the customers into top percents
nr <- nrow(df)
p10 <- round(nr * .10, 0)
cat("Number of people in the Top 10% :", p10, "\n")
p20 <- round(nr * .20, 0)
p11_20 <- p20-p10
cat("Number of people in the 11-20% :", p11_20, "\n")
#Keep only those customers in the top groups
df <- df[1:p20,]
#Create a variable to identify the percent group the customer is in
top_pct <- integer(length = p10 + p11_20)
#Identify those in each group
top_pct[1:p10] <- 10
top_pct[(p10+1):p20] <- 20
#Add this variable to the data frame
df$top_pct <- top_pct
#Keep only custID and the new variable
df <- subset(df, select = c(custID, top_pct))
return(df)
}
##Run the customer distribution function
v1Levels <- levels(data$v1)
res <- lapply(v1Levels, pf, df = data)
#Explore the results
summary(res)
# Length Class Mode
# [1,] 2 data.frame list
# [2,] 2 data.frame list
print(res)
# [[1]]
# custID top_pct
# 1 1 10
# 2 2 20
#
# [[2]]
# custID top_pct
# 11 1 10
# 16 6 10
# 12 2 20
# 17 7 20
##Merge the two data frames but with top_pct as a different variable for each category
#Change the new variable name
for(i in 1:length(res)) {
names(res[[i]])[2] <- paste0(v1Levels[i], "_top_pct")
}
#Merge the results
res_m <- res[[1]]
for(i in 2:length(res)) {
res_m <- merge(res_m, res[[i]], by = "custID", all = TRUE)
}
print(res_m)
# custID A_top_pct B_top_pct
# 1 1 10 10
# 2 2 20 20
# 3 6 NA 10
# 4 7 NA 20
Stick to your Stata instincts and use a single data set:
require(data.table)
DT <- data.table(data)
DT[,r:=rank(v2)/.N,by=v1]
You can see the result by typing DT.
From here, you can group the within-v1 rank, r, if you want to. Following Stata idioms...
DT[,g:={
x = rep(0,.N)
x[r>.8] = 20
x[r>.9] = 10
x
}]
This is like gen and then two replace ... if statements. Again, you can see the result with DT.
Finally, you can subset with
DT[g>0]
which gives
custID v1 v2 r g
1: 1 A 30 1.000 10
2: 2 A 29 0.900 20
3: 1 B 20 0.975 10
4: 2 B 19 0.875 20
5: 6 B 20 0.975 10
6: 7 B 19 0.875 20
These steps can also be chained together:
DT[,r:=rank(v2)/.N,by=v1][,g:={x = rep(0,.N);x[r>.8] = 20;x[r>.9] = 10;x}][g>0]
(Thanks to #ExperimenteR:)
To rearrange for the desired output in the OP, with values of v1 in columns, use dcast:
dcast(
DT[,r:=rank(v2)/.N,by=v1][,g:={x = rep(0,.N);x[r>.8] = 20;x[r>.9] = 10;x}][g>0],
custID~v1)
Currently, dcast requires the latest version of data.table, available (I think) from Github.
You don't need the function pf to achieve what you want. Try dplyr/tidyr combo
library(dplyr)
library(tidyr)
data %>%
group_by(v1) %>%
arrange(desc(v2))%>%
mutate(n=n()) %>%
filter(row_number() <= round(n * .2)) %>%
mutate(top_pct= ifelse(row_number()<=round(n* .1), 10, 20)) %>%
select(custID, top_pct) %>%
spread(v1, top_pct)
# custID A B
#1 1 10 10
#2 2 20 20
#3 6 NA 10
#4 7 NA 20
The idiomatic way to do this kind of thing in R would be to use a combination of split and lapply. You're halfway there with your use of lapply; you just need to use split as well.
lapply(split(data, data$v1), function(df) {
cutoff <- quantile(df$v2, c(0.8, 0.9))
top_pct <- ifelse(df$v2 > cutoff[2], 10, ifelse(df$v2 > cutoff[1], 20, NA))
na.omit(data.frame(id=df$custID, top_pct))
})
Finding quantiles is done with quantile.

Count number of occurences for each unique value

Let's say I have:
v = rep(c(1,2, 2, 2), 25)
Now, I want to count the number of times each unique value appears. unique(v) returns what the unique values are, but not how many they are.
> unique(v)
[1] 1 2
I want something that gives me
length(v[v==1])
[1] 25
length(v[v==2])
[1] 75
but as a more general one-liner :) Something close (but not quite) like this:
#<doesn't work right> length(v[v==unique(v)])
Perhaps table is what you are after?
dummyData = rep(c(1,2, 2, 2), 25)
table(dummyData)
# dummyData
# 1 2
# 25 75
## or another presentation of the same data
as.data.frame(table(dummyData))
# dummyData Freq
# 1 1 25
# 2 2 75
If you have multiple factors (= a multi-dimensional data frame), you can use the dplyr package to count unique values in each combination of factors:
library("dplyr")
data %>% group_by(factor1, factor2) %>% summarize(count=n())
It uses the pipe operator %>% to chain method calls on the data frame data.
It is a one-line approach by using aggregate.
> aggregate(data.frame(count = v), list(value = v), length)
value count
1 1 25
2 2 75
length(unique(df$col)) is the most simple way I can see.
table() function is a good way to go, as Chase suggested.
If you are analyzing a large dataset, an alternative way is to use .N function in datatable package.
Make sure you installed the data table package by
install.packages("data.table")
Code:
# Import the data.table package
library(data.table)
# Generate a data table object, which draws a number 10^7 times
# from 1 to 10 with replacement
DT<-data.table(x=sample(1:10,1E7,TRUE))
# Count Frequency of each factor level
DT[,.N,by=x]
To get an un-dimensioned integer vector that contains the count of unique values, use c().
dummyData = rep(c(1, 2, 2, 2), 25) # Chase's reproducible data
c(table(dummyData)) # get un-dimensioned integer vector
1 2
25 75
str(c(table(dummyData)) ) # confirm structure
Named int [1:2] 25 75
- attr(*, "names")= chr [1:2] "1" "2"
This may be useful if you need to feed the counts of unique values into another function, and is shorter and more idiomatic than the t(as.data.frame(table(dummyData))[,2] posted in a comment to Chase's answer. Thanks to Ricardo Saporta who pointed this out to me here.
This works for me. Take your vector v
length(summary(as.factor(v),maxsum=50000))
Comment: set maxsum to be large enough to capture the number of unique values
or with the magrittr package
v %>% as.factor %>% summary(maxsum=50000) %>% length
Also making the values categorical and calling summary() would work.
> v = rep(as.factor(c(1,2, 2, 2)), 25)
> summary(v)
1 2
25 75
You can try also a tidyverse
library(tidyverse)
dummyData %>%
as.tibble() %>%
count(value)
# A tibble: 2 x 2
value n
<dbl> <int>
1 1 25
2 2 75
If you need to have the number of unique values as an additional column in the data frame containing your values (a column which may represent sample size for example), plyr provides a neat way:
data_frame <- data.frame(v = rep(c(1,2, 2, 2), 25))
library("plyr")
data_frame <- ddply(data_frame, .(v), transform, n = length(v))
You can also try dplyr::count
df <- tibble(x=c('a','b','b','c','c','d'), y=1:6)
dplyr::count(df, x, sort = TRUE)
# A tibble: 4 x 2
x n
<chr> <int>
1 b 2
2 c 2
3 a 1
4 d 1
If you want to run unique on a data.frame (e.g., train.data), and also get the counts (which can be used as the weight in classifiers), you can do the following:
unique.count = function(train.data, all.numeric=FALSE) {
# first convert each row in the data.frame to a string
train.data.str = apply(train.data, 1, function(x) paste(x, collapse=','))
# use table to index and count the strings
train.data.str.t = table(train.data.str)
# get the unique data string from the row.names
train.data.str.uniq = row.names(train.data.str.t)
weight = as.numeric(train.data.str.t)
# convert the unique data string to data.frame
if (all.numeric) {
train.data.uniq = as.data.frame(t(apply(cbind(train.data.str.uniq), 1,
function(x) as.numeric(unlist(strsplit(x, split=","))))))
} else {
train.data.uniq = as.data.frame(t(apply(cbind(train.data.str.uniq), 1,
function(x) unlist(strsplit(x, split=",")))))
}
names(train.data.uniq) = names(train.data)
list(data=train.data.uniq, weight=weight)
}
I know there are many other answers, but here is another way to do it using the sort and rle functions. The function rle stands for Run Length Encoding. It can be used for counts of runs of numbers (see the R man docs on rle), but can also be applied here.
test.data = rep(c(1, 2, 2, 2), 25)
rle(sort(test.data))
## Run Length Encoding
## lengths: int [1:2] 25 75
## values : num [1:2] 1 2
If you capture the result, you can access the lengths and values as follows:
## rle returns a list with two items.
result.counts <- rle(sort(test.data))
result.counts$lengths
## [1] 25 75
result.counts$values
## [1] 1 2
count_unique_words <-function(wlist) {
ucountlist = list()
unamelist = c()
for (i in wlist)
{
if (is.element(i, unamelist))
ucountlist[[i]] <- ucountlist[[i]] +1
else
{
listlen <- length(ucountlist)
ucountlist[[i]] <- 1
unamelist <- c(unamelist, i)
}
}
ucountlist
}
expt_counts <- count_unique_words(population)
for(i in names(expt_counts))
cat(i, expt_counts[[i]], "\n")

Resources