Import missing data from a .txt file into existing data frame - r

I have a data frame with this structure:
structure(list(rowNumber = c(0, 1, 2, 3, 4, 5), rowLabel = c("IPU1",
"IPU1", "IPU1", "IPU1", "IPU1", "IPU1"), SampleTime = c(1.317302056,
1.327302056, 1.337302056, 1.347302056, 1.357302056, 1.367302056
), F0 = c(238.4728491, 238.4728491, 238.4728491, 238.4728491,
230.4871243, 235.301327), mother = c("french", "french", "french",
"french", "french", "french"), level = c("bil", "bil", "bil",
"bil", "bil", "bil"), name = c("clemence", "clemence", "clemence",
"clemence", "clemence", "clemence"), task = c("film", "film",
"film", "film", "film", "film"), lang = c("fr", "fr", "fr", "fr",
"fr", "fr"), f0st = c(94.7721745186803, 94.7721745186803, 94.7721745186803,
94.7721745186803, 94.1825081930544, 94.5403877585993), gender = c("F",
"F", "F", "F", "F", "F"), f0stnorm = c(1.11260538951537, 1.11260538951537,
1.11260538951537, 1.11260538951537, 0.934191841072306, 1.0424743738019
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
But I realised that some data are missing, for example, I don't have the values for name = alan & task = film & lang = eng but I do have these data in a separate .txt file. Is there a way to import the data from my .txt file directly into the data frame?

Assuming the txt file is comma-delimited, that your data frame is named df1, and that the txt file is named df2.txt:
library(data.table)
# Path to the text file that already holds the missing rows (adjust as needed).
# The same path must be used for writing and reading.
txt_path <- "file location/df2.txt"
# Append the rows of df1 to the end of the text file. With append = TRUE no
# header row is written, so values are appended by position.
# NOTE(review): assumes df1 has the same columns, in the same order, as the file.
fwrite(df1, txt_path, append = TRUE)
# Read the combined file back in; the path has to be a quoted string.
df1 <- fread(txt_path)

Related

calculate duration in a complex table

I have a table as shown.
# Wide table: one row per person, one column per month holding the position
# code recorded for that month ("A", "SA", "M", or NA when nothing is recorded).
# check.names = FALSE keeps the date headers verbatim; by default data.frame()
# would rewrite "01-Jan-19" into the syntactic name "X01.Jan.19".
df <- data.frame("name" = c("jack", "william", "david", "john"),
  "01-Jan-19" = c(NA, "A", NA, "A"),
  "01-Feb-19" = c("A", "A", NA, "A"),
  "01-Mar-19" = c("A", "A", "A", "A"),
  "01-Apr-19" = c("A", "A", "A", "A"),
  "01-May-19" = c(NA, "A", "A", "A"),
  "01-Jun-19" = c("A", "SA", "A", "SA"),
  "01-Jul-19" = c("A", "SA", "A", "SA"),
  "01-Aug-19" = c(NA, "SA", "A", "SA"),
  "01-Sep-19" = c(NA, "SA", "A", "SA"),
  "01-Oct-19" = c("SA", "SA", "A", "SA"),
  "01-Nov-19" = c("SA", "SA", NA, "SA"),
  "01-Dec-19" = c("SA", "SA", "SA", NA),
  "01-Jan-20" = c("SA", "M", "A", "M"),
  "01-Feb-20" = c("M", "M", "M", "M"),
  check.names = FALSE)
Over a time period, each person progresses through a sequence of positions (3 position categories, from A to SA to M). My objective is:
Calculate the average duration of A (assistant) position and SA (senior assistant) position. i.e. the duration between the date the first of one category appears, and the date the last of this category appears, regardless of missing data in between.
I transposed the data using R “gather” function
# Reshape from wide to long: columns 2 through 15 (the date columns) are
# stacked into a "date" key column and a "position" value column.
df1 <- gather (df, "date", "position", 2:15)
then I am not sure how to best proceed. What might be the best way to further approach this?
We can get the data in longer format and calculate the number of days between first date when the person was "SA" and the first date when he was "A".
library(dplyr)
library(lubridate)  # provides dmy(), used below to parse the date headers

# For each person, compute the time between the first month recorded as "A"
# and the first month recorded as "SA".
df %>%
  # Long format: one row per (name, date) pair; months with no record are dropped.
  tidyr::pivot_longer(cols = -name, names_to = 'date', values_drop_na = TRUE) %>%
  # The former column headers (e.g. "01-Jan-19") are day-month-year strings.
  mutate(date = dmy(date)) %>%
  group_by(name) %>%
  # match() returns the index of the first occurrence, so each lookup selects
  # the first row carrying that position (the earliest date, given that the
  # columns of df are in chronological order). The result is NA for a person
  # who never held one of the two positions.
  # The column is named `duration` to agree with the printed result and with
  # the follow-up pull(duration).
  summarise(duration = date[match('SA', value)] - date[match('A', value)])
# name duration
# <fct> <drtn>
#1 david 275 days
#2 jack 242 days
#3 john 151 days
#4 william 151 days
If the mean value is needed, we can pull the duration column and compute its mean by appending the following to the chain above:
%>% pull(duration) %>% mean
#Time difference of 204.75 days
data
df <- structure(list(name = c("jack", "william", "david", "john"),
`01-Jan-19` = c(NA, "A", NA, "A"), `01-Feb-19` = c("A", "A",
NA, "A"), `01-Mar-19` = c("A", "A", "A", "A"), `01-Apr-19` = c("A",
"A", "A", "A"), `01-May-19` = c(NA, "A", "A", "A"), `01-Jun-19` = c("A",
"SA", "A", "SA"), `01-Jul-19` = c("A", "SA", "A", "SA"),
`01-Aug-19` = c(NA, "SA", "A", "SA"), `01-Sep-19` = c(NA,
"SA", "A", "SA"), `01-Oct-19` = c("SA", "SA", "A", "SA"),
`01-Nov-19` = c("SA", "SA", NA, "SA"), `01-Dec-19` = c("SA",
"SA", "SA", NA), `01-Jan-20` = c("SA", "M", "A", "M"), `01-Feb-20` = c("M",
"M", "M", "M")), row.names = c(NA, -4L), class = "data.frame")

How do I use bind_rows in a list of table?

Suppose I have three tables as below:
table1 <- structure(list(Pos = 1:6, A = c(16.8508287292818, 0, 0.552486187845304,
0, 1.10497237569061, 1.38121546961326), C = c(1.93370165745856,
0.276243093922652, 0.828729281767956, 0.276243093922652, 0, 0.552486187845304
), G = c(1.10497237569061, 2.48618784530387, 0.276243093922652,
0.828729281767956, 0.276243093922652, 0), T = c(0.828729281767956,
0, 0.828729281767956, 1.10497237569061, 0, 0)), .Names = c("Pos",
"A", "C", "G", "T"), row.names = c(NA, 6L), class = "data.frame")
table2<- structure(list(Pos = 1:6, A = c(4.15584415584416, 1.03896103896104,
0.779220779220779, 0.692640692640693, 2.25108225108225, 2.94372294372294
), C = c(1.12554112554113, 0.173160173160173, 0.173160173160173,
0.519480519480519, 0.173160173160173, 0.173160173160173), G = c(1.03896103896104,
0.346320346320346, 0.0865800865800866, 0.432900432900433, 0.519480519480519,
0.0865800865800866), T = c(2.77056277056277, 0.606060606060606,
0.25974025974026, 0.692640692640693, 0.346320346320346, 0.25974025974026
)), .Names = c("Pos", "A", "C", "G", "T"), row.names = c(NA,
6L), class = "data.frame")
table3 <- structure(list(Pos = 1:6, A = c(10.3492063492063, 0.317460317460317,
0.349206349206349, 0.920634920634921, 1.96825396825397, 1.23809523809524
), C = c(0.825396825396825, 0.126984126984127, 0.349206349206349,
0.317460317460317, 0.19047619047619, 0.253968253968254), G = c(0.761904761904762,
0.952380952380952, 0.285714285714286, 0.412698412698413, 0.126984126984127,
0.19047619047619), T = c(1.07936507936508, 0.412698412698413,
0.476190476190476, 0.253968253968254, 0.19047619047619, 0.253968253968254
)), .Names = c("Pos", "A", "C", "G", "T"), row.names = c(NA,
6L), class = "data.frame")
I have now saved the table names as files.table:
# Names of the three data frames as strings: "table1", "table2", "table3".
files.table <- paste0("table", 1:3)
My problem is that I could not run this bind_rows function to bind table1, table2 and table3 using files.table instead of listing all three tables. This is the error I get: Error in bind_rows_(x, .id) : Argument 1 must have names
This is the code I tried:
# NOTE: files.table holds the table *names* as character strings, not the data
# frames themselves; this is the call that raises the error quoted above.
bind.table <- bind_rows(files.table, .id = "table") %>%
gather(Base, Percent, -Pos, -table)
The .id argument for bind_rows sets the name of the variable containing the name of the list item each row came from, not these names themselves. You set the table names by naming the items in the list. Then, bind_rows will get those names and put them into a column with a name you specify:
# Collect the data frames in a list whose element names are "table1".."table3".
table_list <- setNames(list(table1, table2, table3), paste0("table", 1:3))
# bind_rows() stacks the data frames and stores each element's name in the
# column given by .id (here a column called "id").
bind.table <- bind_rows(table_list, .id = 'id')
From ?bind_rows:
Each argument can either be a data frame, a list that could be a data
frame, or a list of data frames
The easiest way to get the data frames into bind_rows is to assemble them into a list and then just pass the list of data frames in. As @joran suggests, the easiest way to do this is to load or generate them in a lapply function which will automatically output a list that can go into bind_rows.

Error in ggtexttable (ggpubr)

I'm trying to create a publication-ready table using the ggtexttable function from ggpubr. I have a data frame:
dput(df)
structure(list(feature = list("start_codon", "stop_codon", "intergenic",
"3UTR", "5UTR", "exon", "intron", "ncRNA", "pseudogene"),
observed = list(structure(1L, .Names = "start_codon"), structure(1L, .Names = "stop_codon"),
structure(418L, .Names = "intergenic"), structure(48L, .Names = "3UTR"),
structure(28L, .Names = "5UTR"), structure(223L, .Names = "exon"),
structure(578L, .Names = "intron"), structure(20L, .Names = "ncRNA"),
structure(1L, .Names = "pseudogene")), expected = list(
0.286, 0.286, 369.02, 72.461, 33.165, 257.869, 631.189,
48.491, 3.172), fc = list(3.5, 3.5, 1.1, 0.7, 0.8, 0.9,
0.9, 0.4, 0.3), test = list("enrichment", "enrichment",
"enrichment", "depletion", "depletion", "depletion",
"depletion", "depletion", "depletion"), sig = list("F",
"F", "T", "T", "F", "T", "T", "T", "F"), p_val = list(
"0.249", "0.249", "0.00186", "0.00116", "0.209", "0.00814",
"0.00237", "<1e-04", "0.175")), class = "data.frame", row.names = c(NA,
-9L), .Names = c("feature", "observed", "expected", "fc", "test",
"sig", "p_val"))
And when I try to turn this into a table:
ggtexttable(df)
I get the error:
Error in (function (label, parse = FALSE, col = "black", fontsize =
12, : unused arguments (label.feature = dots[[5]][1],
label.observed = dots[[6]][1], label.expected = dots[[7]][1],
label.fc = dots[[8]][1], label.test = dots[[9]][1], label.sig_val
= dots[[10]][1], label.p_val = dots[[11]][1])
Does anyone know what might be causing this?
This works fine:
df <- head(iris)
ggtexttable(df)
I have found the problem and a solution that should work for you. First of all, your data is not in the proper format (every column is a nested list), which is why you were getting this error when trying to display it. You can easily check the structure of the dataset by running this in your console: str(data)
Here is the solution to convert your data to data.frame:
# Flatten every list-column into a plain atomic vector ...
first.step <- lapply(data, unlist)
# ... then rebuild an ordinary data frame, keeping strings as character.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
second.step <- as.data.frame(first.step, stringsAsFactors = FALSE)
Then you can easily use the function ggtexttable(second.step) and it displays the table with your data.

How to search for string patterns in another string and include a separator?

My data is structured as follows:
dput(head(CharacterAnalysis,5))
# Lookup table: each single-character code and the movement it describes.
# (The list( call is closed before .Names so the expression parses.)
structure(list(Character = c("A", "a", "B", "b", "C"),
Descriptor = c("Jog", "Change Direction", "Shuffle", "Walk", "Stop")),
.Names = c("Character", "Descriptor"),
row.names = c(NA, 5L), class = "data.frame")
I wish to lookup the Character and relevant Descriptor in the following data frame, but am unsure how to do so:
dput(head(StringAnalysis,3))
# Strings to decode: every character is one movement code.
# (The list( call is closed before .Names so the expression parses.)
structure(list(MovementString = c("ACb", "aAaB", "BbCa")),
.Names = c("MovementString"),
row.names = c(NA, 3L), class = "data.frame")
My expected outcome/ data frame would be:
dput(head(Output,3))
# Desired result: the original string plus its descriptors joined by "/ ".
# (The list( call is closed, followed by a comma, so the expression parses.)
structure(list(MovementString = c("ACb", "aAaB", "BbCa"),
MovementPerformed = c("Jog/ Stop/ Walk", "Change Direction/ Jog/ Change Direction/ Shuffle", "Shuffle/ Walk/ Stop/ Change Direction")),
.Names = c("MovementString", "MovementPerformed"),
row.names = c(NA, 3L), class = "data.frame")
I would like a forward slash (/) or similar to separate each Descriptor, as it signals a new movement. Any advice on how to achieve this would be appreciated. My data frame CharacterAnalysis is over 1 million rows long, so I do not wish to have to search for each MovementString separately!
Thank you.
# Lookup table: one row per movement character and its descriptor.
CharacterAnalysis <-
  structure(list(Character = c("A", "a", "B", "b", "C"),
                 Descriptor = c("Jog", "Change Direction", "Shuffle", "Walk", "Stop")),
            .Names = c("Character", "Descriptor"),
            row.names = c(NA, 5L), class = "data.frame")
# Movement strings to decode; MovementPerformed is recomputed below.
Output <-
  structure(list(MovementString = c("ACb", "aAaB", "BbCa"),
                 MovementPerformed = c("Jog/ Stop/ Walk", "Change Direction/ Jog/ Change Direction/ Shuffle", "Shuffle/ Walk/ Stop/ Change Direction")),
            .Names = c("MovementString", "MovementPerformed"),
            row.names = c(NA, 3L), class = "data.frame")
# A simple approach based on names
# Build the lookup table just once: a character vector of descriptors whose
# names are the single-character codes, so m["A"] is "Jog".
m <- CharacterAnalysis$Descriptor
names(m) <- CharacterAnalysis$Character
# Build the MovementPerformed column
# strsplit(x, "") breaks every string into its individual characters; each
# character vector is translated through m and joined with "/ ".
# vapply() guarantees a character vector of the same length as the input,
# whereas sapply() would return list() for zero-row input.
Output$MovementPerformed <-
  vapply(strsplit(Output$MovementString, ""),
         FUN = function(x) paste(m[x], collapse = "/ "),
         FUN.VALUE = character(1))

merge multiple dbf files with non-matching headers in R

I have over 800 dbf files which I need to import and merge in R. I have been able to bring in all of the files using this code:
# The foreign package provides read.dbf().
library(foreign)
# Work from the directory that holds the .dbf files.
setwd("c:/temp/help/")
# Names of all files in the working directory ending in ".dbf".
files <- list.files(pattern="\\.dbf$")
# Read every file into a data frame; as.is=FALSE is forwarded to read.dbf.
all.the.data <- lapply(files, read.dbf, as.is=FALSE)
# Stack all data frames by row. rbind() on data frames requires matching
# column names, so this step fails when the files have different headers.
DATA <- do.call("rbind",all.the.data)
However, these dbf files have different numbers of columns and even if they sometimes have the same number of columns, those headers may be different. Here are four of the dbf files to provide an example:
file01 <- structure(list(PLOTBUFFER = structure(1L, .Label = "1002_2km", class = "factor"),
VALUE_11 = 11443500, VALUE_31 = 13500, VALUE_42 = 928800,
VALUE_43 = 162000, VALUE_90 = 18900), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_31", "VALUE_42", "VALUE_43", "VALUE_90"), row.names = c(NA,
-1L), class = "data.frame", data_types = c("C", "F", "F", "F",
"F", "F"))
file02 <- structure(list(PLOTBUFFER = structure(1L, .Label = "1002_5km", class = "factor"),
VALUE_11 = 66254400, VALUE_21 = 125100, VALUE_31 = 80100,
VALUE_41 = 4234500, VALUE_42 = 3199500, VALUE_43 = 4194000,
VALUE_52 = 376200, VALUE_90 = 72000), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_21", "VALUE_31", "VALUE_41", "VALUE_42", "VALUE_43",
"VALUE_52", "VALUE_90"), row.names = c(NA, -1L), class = "data.frame", data_types = c("C",
"F", "F", "F", "F", "F", "F", "F", "F"))
file03 <- structure(list(PLOTBUFFER = structure(1L, .Label = "1003_2km", class = "factor"),
VALUE_11 = 1972800, VALUE_31 = 125100, VALUE_41 = 5316300,
VALUE_42 = 990900, VALUE_43 = 1995300, VALUE_52 = 740700,
VALUE_90 = 1396800, VALUE_95 = 25200), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_31", "VALUE_41", "VALUE_42", "VALUE_43", "VALUE_52",
"VALUE_90", "VALUE_95"), row.names = c(NA, -1L), class = "data.frame", data_types = c("C",
"F", "F", "F", "F", "F", "F", "F", "F"))
file04 <- structure(list(PLOTBUFFER = structure(1L, .Label = "1003_5km", class = "factor"),
VALUE_11 = 43950600, VALUE_31 = 270000, VALUE_41 = 12969900,
VALUE_42 = 5105700, VALUE_43 = 12614400, VALUE_52 = 1491300,
VALUE_90 = 2055600, VALUE_95 = 70200), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_31", "VALUE_41", "VALUE_42", "VALUE_43", "VALUE_52",
"VALUE_90", "VALUE_95"), row.names = c(NA, -1L), class = "data.frame", data_types = c("C",
"F", "F", "F", "F", "F", "F", "F", "F"))
I would like the dataframe to match this:
merged <- structure(list(PLOTBUFFER = structure(1:2, .Label = c("1002_2km",
"1002_5km"), class = "factor"), VALUE_11 = c(11443500, 66254400
), VALUE_21 = c(0, 125100), VALUE_31 = c(13500, 80100), VALUE_41 = c(0,
4234500), VALUE_42 = c(928800, 3199500), VALUE_43 = c(162000,
4194000), VALUE_52 = c(0, 376200), VALUE_90 = c(18900, 72000)), .Names = c("PLOTBUFFER",
"VALUE_11", "VALUE_21", "VALUE_31", "VALUE_41", "VALUE_42", "VALUE_43",
"VALUE_52", "VALUE_90"), class = "data.frame", row.names = c(NA,
-2L))
Where if there is a missing column from one dataset it simply is filled in with a zero or NULL.
Thanks
-al
The suggestion by @infominer worked for the 4 files I included as an example, but when I tried to use merge_recurse on the large list of 802 elements, I received an error.
# Names of all files in the working directory ending in ".dbf".
files <- list.files(pattern="\\.dbf$")
# Read every file into a data frame; as.is=FALSE is forwarded to read.dbf.
all.the.data <- lapply(files, read.dbf, as.is=FALSE)
# Merge the whole list in one call; with the full list this is the call that
# raised the "evaluation nested too deeply" error shown below.
merged <- merge_recurse(all.the.data)
Error: evaluation nested too deeply: infinite recursion / options(expressions=)?
Error during wrapup: evaluation nested too deeply: infinite recursion / options(expressions=)?
Use the package reshape
# merge_recurse() comes from the reshape package.
library(reshape)
# Merge the four example data frames into a single data frame.
merged.files <-merge_recurse(list(file01,file02,file03,file04))
Edit:
Try this code thanks to Ramnath
# Fold the list pairwise with a full outer merge: all = TRUE keeps the rows
# that have no match in the other input and fills the missing cells with NA.
# Reduce() iterates rather than recursing, so long lists do not hit the
# expression-nesting limit.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Reduce(function(...) merge(..., all = TRUE), all.the.data)
adapted from https://stackoverflow.com/a/6947326/2747709

Resources