Convert data frame columns to vectors - r

I have a dataframe named "Continents_tmap" from which I want to extract 3 vectors, as in the following examples. Note: "Values" should hold the Cases column of the dataframe.
labels = c("France","Germany","India", etc)
Parent = c("Europe","Europe","Asia",etc)
Values = c(100,345,456,etc)
My current code is as follows.
# Load the raw COVID-19 activity export, keeping strings as character.
covid_1 <-
read.csv("C:/Users/Owner/Downloads/COVID-19 Activity.csv", stringsAsFactors = FALSE)
# Keep only the date, country, count and continent columns.
# NOTE(review): select() and %>% are not base R -- presumably dplyr is
# attached elsewhere; confirm.
df1 <-
select(
covid_1,
REPORT_DATE,
COUNTRY_SHORT_NAME,
COUNTRY_ALPHA_3_CODE,
PEOPLE_DEATH_NEW_COUNT,
PEOPLE_POSITIVE_NEW_CASES_COUNT,
PEOPLE_DEATH_COUNT,
PEOPLE_POSITIVE_CASES_COUNT,
CONTINENT_NAME
)
# Sum deaths and cases per continent/country pair.
# NOTE(review): Continent, Country.x, Deaths and Positive_Cases are not among
# the columns selected above -- presumably a rename/join step was left out of
# this excerpt; verify before running.
Continents_tmap <- df1 %>%
group_by(Continent,Country.x) %>%
summarise(Deaths = sum(Deaths), Cases = sum(Positive_Cases))
# Convert the summarised result to a plain data.frame.
Continents_tmap<- data.frame(Continents_tmap)

Related

Mutating a count of rows per group matching a subset condition

I wish to mutate a new column called SF_COUNT which is a count per group (ID) of the number of rows per group where the column type contains 'SF'
A reproducible example looks as follows:
# Example input: one row per observation, identified by ID and typed RF/SF.
df <- data.frame(
  ID = rep(c(1234, 4567, 8900), times = c(3, 4, 3)),
  type = c("RF", "SF", "SF", "RF", "SF", "SF", "SF", "RF", "SF", "SF")
)
My final data frame looks like:
# Expected output: the input plus SF_COUNT, the number of "SF" rows per ID.
final_df <- data.frame(
  ID = rep(c(1234, 4567, 8900), times = c(3, 4, 3)),
  type = c("RF", "SF", "SF", "RF", "SF", "SF", "SF", "RF", "SF", "SF"),
  SF_COUNT = rep(c(2, 3, 2), times = c(3, 4, 3))
)
How can I achieve this in dplyr please?
After grouping by 'ID', get the sum of logical vector (type == 'SF') in mutate to create the new column
library(dplyr)
# Per ID, count the rows whose type is exactly "SF" and repeat that count on
# every row of the group.
df <- df %>%
  group_by(ID) %>%
  mutate(SF_COUNT = sum(type == "SF", na.rm = TRUE)) %>%
  # Drop the grouping so later verbs are not silently applied per ID.
  ungroup()
If it is a substring, then use str_detect
library(stringr)
# Per ID, count the rows whose type contains the substring "SF" and repeat
# that count on every row of the group.
df <- df %>%
  group_by(ID) %>%
  mutate(SF_COUNT = sum(str_detect(type, "SF"), na.rm = TRUE)) %>%
  # Drop the grouping so later verbs are not silently applied per ID.
  ungroup()

mutate data frame to check if data duplicate between two data frame

I have two data frames; this is just a sample, as the real database has approximately 1 million records.
The columns can hold names, emails, alphanumeric codes, etc.
# Two sample tables with identical columns; only the `ID 1` values differ.
# check.names = FALSE keeps the non-syntactic names ("ID 1", "Enternal code")
# as written; by default data.frame() rewrites them to ID.1 / Enternal.code,
# and backtick references such as `ID 1` then fail.
data1 <- data.frame(
  "ID 1" = c(86364, "ARV_2612", "AGH_2212", "IND_2622", "CHG_2622"),
  sector = c(3, 3, 1, 2, 5),
  name = c("nhug", "hugy", "mjuk", "ghtr", "kuld"),
  "Enternal code" = c(1, 1, 1, 1, 3),
  col3 = c(1, 1, 0, 0, 0),
  col4 = c(1, 0, 0, 0, 0),
  col5 = c(1, 0, 1, 1, 1),
  check.names = FALSE
)
data2 <- data.frame(
  "ID 1" = c(53265, "ARV_7362", 76354, "IND_2622", "CHG_9762"),
  sector = c(3, 3, 1, 2, 5),
  name = c("nhug", "hugy", "mjuk", "ghtr", "kuld"),
  "Enternal code" = c(1, 1, 1, 1, 3),
  col3 = c(1, 1, 0, 0, 0),
  col4 = c(1, 0, 0, 0, 0),
  col5 = c(1, 0, 1, 1, 1),
  check.names = FALSE
)
# Flag each data2 row whose `ID 1` also occurs in data1 (1 = present,
# 0 = absent) and store the flag as a factor.
mutate(
  data2,
  duplicated = factor(if_else(`ID 1` %in% pull(data1, `ID 1`), 1, 0))
)
Now I am looking for a function that mutates one data frame (data2): given a column name from data1 and a column name from data2, it should check whether each value or string already exists in the other data frame and add a new column set to 1 for true and 0 for false.
the function would be like
# Desired call shape (illustrative only): the column to check in each data
# frame plus the name of the new indicator column.
func(data1 = "name",data2="name",mutated_com="name_exist")
In base R, you can write this function as :
#' Flag values of one data frame column that also occur in another
#'
#' @param data1 Data frame holding the reference values.
#' @param data2 Data frame to annotate.
#' @param data1col Name of the column of `data1` to look values up in.
#' @param data2col Name of the column of `data2` whose values are checked.
#' @param newcol Name of the indicator column added to `data2`.
#' @return `data2` with `newcol` added: a factor that is 1 where the
#'   `data2col` value occurs in `data1col`, and 0 otherwise.
func <- function(data1, data2, data1col, data2col, newcol) {
  reference <- data1[[data1col]]
  candidates <- data2[[data2col]]
  # `[[` returns NULL for an unknown column name and `x %in% NULL` is all
  # FALSE, so a misspelt data1col would silently flag every row as 0.
  if (is.null(reference)) {
    stop("data1col '", data1col, "' is not a column of data1", call. = FALSE)
  }
  if (is.null(candidates)) {
    stop("data2col '", data2col, "' is not a column of data2", call. = FALSE)
  }
  data2[[newcol]] <- factor(as.integer(candidates %in% reference))
  data2
}
and can call it as :
# Flag data2 rows whose `name` also appears in data1's `name` column.
func(
  data1,
  data2,
  data1col = "name",
  data2col = "name",
  newcol = "duplicate"
)
This will create a column named duplicate in data2, giving 1 where the name in data2 is also present in the name column of data1 and 0 otherwise.
Using dplyr syntax the above can be written as :
library(dplyr)
library(rlang)
#' Flag values of one data frame column that also occur in another (dplyr)
#'
#' @param data1 Data frame holding the reference values.
#' @param data2 Data frame to annotate.
#' @param data1col Name of the column of `data1` to look values up in.
#' @param data2col Name of the column of `data2` whose values are checked.
#' @param newcol Name of the indicator column added to `data2`.
#' @return `data2` with `newcol` added: a factor that is 1 where the
#'   `data2col` value occurs in `data1col`, and 0 otherwise.
func <- function(data1, data2, data1col, data2col, newcol) {
  reference <- data1[[data1col]]
  # `[[` returns NULL for an unknown column name and `x %in% NULL` is all
  # FALSE, so a misspelt data1col would silently flag every row as 0.
  if (is.null(reference)) {
    stop("data1col '", data1col, "' is not a column of data1", call. = FALSE)
  }
  data2 %>%
    mutate(
      # `!!newcol :=` takes the new column's name from a string; `.env$`
      # keeps a data2 column called `reference` from masking the lookup.
      !!newcol := factor(as.integer(.data[[data2col]] %in% .env$reference))
    )
}
You can use an inner_join (from dplyr) to determine the overlap between the two dataframes. To use all columns (if both dataframes have the same column names) you do not have to specify the 'by' argument.
You can then add a column 'duplicated' and join back to the original dataframe (data1 or data2) to get the desired result.
# Rows present in both data frames (matched on all shared columns), flagged 1.
# distinct() keeps one copy of each matching row so that the join below
# cannot multiply rows when a record occurs more than once.
overlap <- data1 %>%
  inner_join(data2) %>%
  distinct() %>%
  mutate(duplicated = 1)
# Attach the flag to every row of the original table; rows without a match
# come back as NA and are recoded to 0.
data1 %>% #or data2
  left_join(overlap) %>%
  mutate(duplicated = ifelse(is.na(duplicated), 0, 1))

Calculate mean for each row across a list of dataframes in R

I want to calculate the mean of each row in the 'value' column across a list of dataframes.
The output I want is a dataframe with the 'sample' name and its associated mean 'value' for each row across the whole list.
Below is some example data:
# Example input: four data frames that share the same 39 sample names in the
# same order. Each has its own grouping (`gr`) and a single `value` repeated
# on every row, stored as text.
dataframe1 <- data.frame(
  sample = c(
    "OP2645ii_c", "OP5048___e", "OP5048___f", "OP5046___d", "OP2645ii_e",
    "OP2645ii_a", "OP5054DNAa", "OP5048___c", "OP2645ii_d", "OP5048___b",
    "OP5047___a", "OP5048___h", "OP5053DNAb", "OP3088i__a", "OP5048___g",
    "OP5053DNAa", "OP5049___a", "OP2645ii_b", "OP5046___c", "OP5044___c",
    "OP2413iiia", "OP5054DNAc", "OP5046___e", "OP5054DNAb", "OP5044___a",
    "OP5046___a", "OP5046___b", "OP2413iiib", "OP5051DNAa", "OP5048___d",
    "OP5044___b", "OP5049___b", "OP5051DNAc", "OP5051DNAb", "OP5053DNAc",
    "OP5047___b", "OP5043___b", "OP5043___a", "OP5052DNAa"
  ),
  gr = c(
    "1", "1", "2", "5", "4", "5", "5", "3", "2", "2",
    "2", "4", "3", "1", "1", "3", "2", "1", "2", "5",
    "5", "5", "2", "2", "2", "1", "2", "1", "1", "1",
    "2", "1", "1", "2", "2", "5", "3", "3", "5"
  ),
  value = rep("14.32500", 39)
)
dataframe2 <- data.frame(
  sample = dataframe1$sample,
  gr = c(
    "5", "4", "3", "5", "4", "5", "5", "3", "2", "2",
    "2", "2", "3", "1", "1", "3", "2", "1", "2", "5",
    "5", "5", "2", "2", "2", "1", "2", "1", "1", "1",
    "2", "1", "1", "2", "4", "4", "4", "4", "4"
  ),
  value = rep("12.59000", 39)
)
dataframe3 <- data.frame(
  sample = dataframe1$sample,
  gr = c(
    "5", "3", "3", "5", "5", "5", "5", "3", "5", "3",
    "3", "3", "3", "3", "3", "3", "2", "1", "2", "1",
    "1", "1", "2", "2", "2", "1", "2", "1", "1", "1",
    "2", "1", "1", "4", "4", "4", "4", "4", "4"
  ),
  value = rep("20.06915", 39)
)
dataframe4 <- data.frame(
  sample = dataframe1$sample,
  gr = c(
    "2", "2", "2", "3", "4", "5", "5", "3", "2", "2",
    "2", "4", "5", "1", "1", "3", "2", "1", "2", "5",
    "5", "5", "2", "2", "2", "1", "2", "1", "1", "1",
    "2", "1", "1", "2", "2", "5", "3", "3", "5"
  ),
  value = rep("18.32500", 39)
)
# Collect the four data frames, in order, into one unnamed list.
list <- list(dataframe1, dataframe2, dataframe3, dataframe4)
Many thanks.
Cheers.
Deon
Using base functions, you could extract all the value columns into a matrix and use row means:
# Put each data frame's `value` column side by side in a matrix, then average
# across the columns for every row.
rowMeans(sapply(list, function(d) d[["value"]]))
For your sample data, you'd also need to convert to numeric (as below), but I'm hoping your real data has numbers, not factors.
# Same row-wise mean, but each `value` column is first converted to numeric
# via character, which works for both factor and character input.
rowMeans(sapply(list, function(d) as.numeric(as.character(d[["value"]]))))
This just gives the values (and assumes the rows are in the right order). You can add the sample names with cbind, e.g., cbind(list[[1]][["sample"]], rowMeans(...)).
We can bind the list elements into a single data.frame, number the rows within each original data frame with row_number(), and group by that row number (and 'sample') to get the mean of 'value'
library(dplyr)
# Stack the data frames, number the rows within each original data frame, and
# average `value` over the rows that share the same position and sample name.
bind_rows(list, .id = "id") %>%
  # Go through as.character() so a factor yields its labels, not its codes.
  mutate(value = as.numeric(as.character(value))) %>%
  group_by(id) %>%
  # row_number() has to run in mutate(): expressions written inside
  # group_by() are evaluated on the ungrouped data, which would number the
  # stacked rows 1..N and leave every group with a single row.
  mutate(grp = row_number()) %>%
  group_by(grp, sample) %>%
  summarise(value = mean(value, na.rm = TRUE)) %>%
  ungroup()

In R, trying to convert a ragged CSV into data.frame of Value, list

I have an input file like:
1A10, 77002, 77003, 77010, 77020
1A20, 77002, 77006, 77007, 77019
1A30, 77006, 77019, 77098
1A40, 77007, 77019, 77027, 77098
1A50, 77005, 77007, 77019, 77024, 77027, 77046, 77081, 77098, 77401
etc....
I want to create a data frame (tibble) where the first column is the same as the first column of my csv, and the second column is a list corresponding to the rest of the columns.
I have failed miserably. Here is my last failure
library(stringr)
library(tidyverse)
# Ask data.frame()/read.csv() not to convert strings to factors.
options(stringsAsFactors = FALSE)
infile <- "~/Rprojects/CrimeStats/BeatZipcodes.csv"
# Create an empty tibble to accumulate results: `beat` is a character column
# and `zips` a list column.
# NOTE(review): data_frame() is deprecated in tibble in favour of tibble().
BeatToZip <- data_frame(
beat=character(),
zips=list()
)
# Read every line of the input file into a character vector.
con=file(infile,open="r")
line=readLines(con)
long=length(line)
# Process one input line per iteration.
# NOTE(review): 1:long counts down (1, 0) when the file is empty;
# seq_len(long) would skip the loop instead.
for (i in 1:long){
print(line[i])
line[i] <- trimws(line[i])
# Split on a comma followed by optional spaces: the first field is the beat,
# the remaining fields are its zip codes.
beat <- str_split(line[i],", *")[[1]][1]
zips <- as.list(str_split(line[i],", *")[[1]][-1])
# NOTE(review): `zips` is a list with one element per zip code, so this looks
# like it builds one row per zip (with `beat` recycled) rather than a single
# row holding all zips of the beat -- confirm; wrapping the zip vector in
# list() would give one row per beat.
temp <- data_frame(beat, zips)
# Append this line's rows to the accumulated result (grows on every pass).
BeatToZip <- rbind(BeatToZip, temp)
}
close(con)
One option after reading the file with read.csv and fill = TRUE
library(tidyverse)
# Read the ragged file, padding short records with NA.
# read.csv() sizes the table from the first five lines only, so a longer
# record further down would be wrapped onto extra rows. Supplying as many
# column names as the widest line has fields prevents that; the names match
# read.csv()'s defaults (V1, V2, ...).
df1 <- read.csv(
  infile,
  fill = TRUE,
  header = FALSE,
  col.names = paste0(
    "V",
    seq_len(max(count.fields(infile, sep = ","), na.rm = TRUE))
  )
)
Gather all the columns except the first one into long format, group by the first column, and summarise the remaining values into a list column:
# Reshape to one row per (beat, zip), dropping the NA padding, then collect
# each beat's zips into a single list-column entry.
df1 %>%
  gather(key, val, -1, na.rm = TRUE) %>%
  # Group by the beat code (first CSV column, V1). Grouping by `key` would
  # bucket the zips by their column position instead of by beat.
  group_by(V1) %>%
  summarise(listCol = list(val))

R rowsums if colnames match two arguments in a second attribute table

I want to calculate rowsums only if the colnames (i.e. species) of my data frame match two conditions in a second attribute table. This means a species should first match the name in one column of the attributes table AND have a certain entry in another column of the attributes table.
However, the attribute table contains more species than the original data frame.
I tried :
# Species data from vegan package:
data(varespec, package = "vegan")
# Create the attributes table: a two-column matrix with length(varespec) rows
# (assumes varespec is a data frame, so this is its number of columns).
attributes <- matrix(NA, length(varespec), 2)
# Column 1 holds the species names; column 2 holds the attribute code
# ("MI" for the first 14 entries, "PI" for the following 30).
attributes[,1] <- colnames(varespec)
attributes[,2] <- c(rep("MI",14),rep("PI",30))
# Add two species that are not in varespec to the attribute table.
x <- c("spec1","MI")
y <- c("spec2","PI")
attributes <- rbind(attributes, x, y)
row.names(attributes) <- c(1:46)
# Intended: calculate row sums only for species that are contained in the
# attributes table and have the entry "MI" there.
# The loops compare every varespec column name (i) with every attribute
# row (j).
# NOTE(review): the body sums all columns of varespec for each row,
# regardless of i and j, and the result is not assigned anywhere -- so the
# sum is never restricted to the matching "MI" species.
for (i in 1:44){
for (j in 1:46){
if ((colnames(varespec)[i] == attributes[j,1]) & (attributes[j,2] == "MI")) {
apply(varespec,1,sum)
}
}}
But it always summed up the whole rows and not only the MI - species.
This is easy to solve if you convert the dataset into a long format
library(dplyr)
library(tidyr)
data(varespec, package = "vegan")
# Lookup table with one row per species and its attribute code. It lists two
# species (spec1, spec2) on top of the varespec columns.
attributes <- data.frame(
  Species = c(colnames(varespec), "spec1", "spec2"),
  Attribute = c(rep(c("MI", "PI"), c(14, 30)), "MI", "PI")
)
# Per row of varespec (ID), total the values of the species tagged "MI".
varespec %>%
  # Move the row names into an ID column; replaces the deprecated
  # add_rownames().
  as_tibble(rownames = "ID") %>%
  gather(Species, Value, -ID) %>% #convert to long format
  # The inner join drops attribute rows whose species is not in the data.
  inner_join(attributes, by = "Species") %>%
  filter(Attribute == "MI") %>%
  group_by(ID) %>%
  summarise(Total = sum(Value))

Resources