Using gsub for removing unwanted characters : facing issues - r

df$Claim_Value <- gsub("Rs.", "", df$`Total Amount Claimed`)
checked class(df$Total Amount Claimed): showing numeric
will gsub work for numeric column ?
Here df$'Total Amount Claimed' is a column which has amount with the text Rs.
For example: Rs.200000. I am trying to remove "Rs." from this column, so I used gsub. It's working, but it shows the amount in thousands and not in lakhs.
How to show amount in lakhs
structure(list(Approver = c("Amarjeet Singh", "Amit Barot", "Amit Barot",
"Amit Barot", "Amit Barot", "Amit Barot"), `Assigned To` = c("SOLUTIONS.MOHALI#PALENGINEERS.IN",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM"), `Resolution Date` = structure(c(1609341652,
1574165400, 1591818814, 1592327216, 1592397052, 1592496000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), `Allow Submit Till Date` = structure(c(NA,
1589414400, NA, NA, NA, NA), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`Amt App by CSS (Without Tax)` = c(NA, NA, NA, NA, NA, NA
), `ESN/Alternator No.` = c("AAG8045S126087", "84846607",
"22321621", "191014875", "25452001", "78939252"), `Auto Approved` = c("No",
"No", "No", "No", "No", "No"), BIS = c("N", "N", "N", "Y",
"Y", "N"), `Batch Amount` = c(NA, NA, NA, NA, NA, NA), `Batch Date` = c(NA,
NA, NA, NA, NA, NA), `Batch Number` = c(NA, NA, NA, NA, NA,
NA), `Category Of Service` = c("Maintenance or repair service",
"Maintenance or repair service", "Maintenance or repair service",
"Maintenance or repair service", "Maintenance or repair service",
"Maintenance or repair service"), `Claim Scope` = c("In Scope",
"In Scope", "In Scope", "In Scope", "In Scope", "In Scope"
), `Claim Type` = c("WARRANTY", "WARRANTY", "WARRANTY", "WARRANTY",
"WARRANTY", "WARRANTY"), `Customer Name` = c("SUDHIR POWER LIMITED",
"BHARGAV EARTH MOVERS", "WAGAD INFRA PROJECT PVT LTD", "SUDHIR POWER LIMITED",
"SUDHIR POWER LIMITED", "CORE MULTI SERVICE"), `Final Amount Approved...16` = c(NA,
NA, NA, NA, NA, NA), `Division Name` = c("Pal Engineers - Mohali",
"Sudhir (Ahmedabad) - Rajkot", "Sudhir Sales & Services Limited, Ahmedabad",
"Sudhir Sales & Services Limited, Ahmedabad", "Sudhir Sales & Services Limited, Ahmedabad",
"Sudhir Sales & Services Limited, Ahmedabad"), `Failure Type` = c("Warranty Failure",
"Warranty Failure", "Warranty Failure", "Warranty Failure",
"Warranty Failure", "Warranty Failure"), `GIEA Agreement Name` = c(NA,
NA, NA, NA, NA, NA), `Cummins Invoice Num` = c(NA, NA, NA,
NA, NA, NA), Agreement = c(NA, NA, NA, NA, NA, NA), `Problem Summary` = c("Electrical issue / PCC Controller issues / Starter / alternator issue / Battery",
"Engine not starting / Tripping / Not stopping", "Maintenance / General Check",
"Engine not starting / Tripping / Not stopping", "Engine not starting / Tripping / Not stopping",
"Leakages - Oil/ Fuel/ Coolant / Air"), `Resolution Summary` = c("After recharge battery tested and failed on load test replaced battery warranty",
"REPAIRED THE FUEL PUMP TAKEN TRAIL ALL PARAMETER LIMIT",
"Last service done by 23/12/2019 at 724 hours qt this time change air filter also.today service done at 974 hours.in between customer says top up oil 2.5 ltr then start the engine running ok now all parameters within limits.",
"attend site check & found starter loose connection then correct it & Suggested to customer requests load balances and require proper ventilation for dg set suction and discharge air .",
"ATTEND THE SITE OBSERVE ENGINE FOUND FAULT SHUTDOWN ERROR NEED TO VISIT OEM SIDE",
"ATTEND SITE CHECK & FOUND FUEL LEAKAGE FROM BLEIND PLUG THEN REMOVED IT & FITMENT GAIN & START ENGINE & FOUND ENGINE RUNNING WITHIN LIMIT.."
), `Claim Rejected` = c("Y", "Y", "Y", "Y", "Y", "Y"), `SR Number` = c("SR-PE-MO-2021-006884",
"SR-SU-RJ-1920-002793", "SR-SU-AH-2021-000683", "SR-SU-AH-2021-000857",
"SR-SU-AH-2021-000865", "SR-SU-AH-2021-000913"), `Service Type` = c(NA,
NA, NA, NA, NA, NA), `Sub Type` = c(NA, NA, NA, NA, NA, NA
), `Amount Claimed By Dealer` = c("Rs.5,721.00", "Rs.19,087.00",
"Rs.1,166.00", "Rs.836.00", "Rs.1,034.00", "Rs.2,057.00"),
`Processed By...29` = c("SOLUTIONS.MOHALI#PALENGINEERS.IN",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM"), `Claim #` = c("1-5W4ZZVR",
"1-5PWNEAT", "1-5QQ4Z4J", "1-5QWC86P", "1-5QXPYU1", "1-5QXU7VN"
), `Claim Category` = c("STANDARD", "STANDARD", "STANDARD",
"STANDARD", "STANDARD", "STANDARD"), `Claim Creation Date` = structure(c(1609844392,
1588360803, 1591890038, 1592481430, 1592577627, 1592582659
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), `Created By` = c("1-5LS00O1",
"1-2CD07UT", "1-2CD07UT", "1-2CD07UT", "1-2CD07UT", "1-2CD07UT"
), `Currency Code` = c("INR", "INR", "INR", "INR", "INR",
"INR"), Partner = c(NA, NA, NA, NA, NA, NA), `Final Amount Approved...36` = c("Rs.0.00",
"Rs.0.00", "Rs.0.00", "Rs.0.00", "Rs.0.00", "Rs.0.00"), `Fund Req Category` = c(NA,
NA, NA, NA, NA, NA), Comments = c(NA, NA, NA, NA, NA, NA),
`Claim Name` = c("CLM-PE-MO-2021-002442", "CLM-SU-RJ-2021-000055",
"CLM-SU-AH-2021-000527", "CLM-SU-AH-2021-000627", "CLM-SU-AH-2021-000641",
"CLM-SU-AH-2021-000643"), `Organization Name` = c("Pal Engineers, Jammu",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD"),
Period = c(NA, NA, NA, NA, NA, NA), `Pre-Approval #` = c(NA,
NA, NA, NA, NA, NA), `Processed By...43` = c("SOLUTIONS.MOHALI#PALENGINEERS.IN",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM"), `Program Account Name` = c(NA,
NA, NA, NA, NA, NA), `Program Name` = c(NA, NA, NA, NA, NA,
NA), `Promotion Name` = c("BTRY_CHANDIGARH", "CIL_20000",
"CIC_5000", "Warranty_BIS_RECON", "Warranty_BIS_RECON", "CIC_5000"
), Description = c(NA, NA, NA, NA, NA, NA), Status = c("Pending",
"Pending", "Pending", "Pending", "Pending", "Pending"), `Final Approval Date` = c(NA,
NA, NA, NA, NA, NA), `Submitted By` = c("SOLUTIONS.MOHALI#PALENGINEERS.IN",
"WARRANTY.AHD#SUDHIRGROUP.COM", "WARRANTY.AHD#SUDHIRGROUP.COM",
"WARRANTY.AHD#SUDHIRGROUP.COM", "WARRANTY.AHD#SUDHIRGROUP.COM",
"WARRANTY.AHD#SUDHIRGROUP.COM"), `Total Amount Approved` = c(0,
0, 0, 0, 0, 0), `Total Amount Claimed` = c("Rs.5,721.00",
"Rs.19,087.00", "Rs.1,166.00", "Rs.836.00", "Rs.1,034.00",
"Rs.2,057.00"), `Total Participation Amount` = c(NA, NA,
NA, NA, NA, NA), Updated = structure(c(1610113437, 1589227258,
1591896091, 1592491326, 1592645576, 1592839702), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), `Updated By` = c("1-4YO9LU", "1-2QTU4R",
"1-SDR5", "1-SDRU", "1-SDRU", "1-SDR5"), `Resolved By FSL` = c("Y",
"N", "Y", "Y", "N", "N"), `Parts Warranty Claim` = c(NA,
NA, NA, NA, NA, NA), `Inbox Last Updated` = structure(c(1610113437,
1589227258, 1591896091, 1592491326, 1592645576, 1592839702
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), `Claim Submitted Date` = structure(c(1609939043,
NA, NA, NA, NA, NA), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`Claim Rejection Reason` = c("Incorrect/Missing Commercial Bills",
"Incorrect/Missing Technical Documents", "HCS or KAM Approval Required",
"Incorrect/Missing Technical Documents", "Incorrect/Missing Technical Documents",
"Incorrect/Missing Technical Documents"), `Claim Categorization Reason` = c(NA,
NA, NA, NA, NA, NA), Aging = c(2.4278125, 244.16599537037,
213.276724537037, 206.387430555556, 204.60212962963, 202.355300925926
), AgeGroup = structure(c(2L, 6L, 6L, 6L, 6L, 6L), .Label = c("0-1 Days",
"2-4 Days", "5-7 Days", "8-15 Days", "16-30 Days", ">30 Days"
), class = "factor"), Zones = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), Approver.y = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Claim_Value = c(NA,
NA, NA, 836L, NA, NA)), row.names = c(NA, 6L), class = "data.frame")

The following should work fine.
# Strip the currency prefix and the thousands separators before converting.
# The dot is escaped (and optional) so it matches only a literal "."; without
# removing the commas, as.numeric() returns NA for values like "Rs.5,721.00".
as.numeric(gsub("Rs\\.?[[:space:]]*|,", "", c("Rs 2000", "Rs.5,721.00")))
provided df$`Total Amount Claimed` column is character type and not a factor type.
For showing in lakhs and not in exponential format, use the option
# Raise the penalty for scientific notation and set the number of significant
# digits used when printing.
options(scipen = 100, digits = 4)

You can turn the values to numeric by using gsub in the following way:
df$`Total Amount Claimed`
#[1] "Rs.5,721.00" "Rs.19,087.00" "Rs.1,166.00" "Rs.836.00" "Rs.1,034.00" "Rs.2,057.00"
# Remove the literal "Rs." prefix and every comma, then convert the remaining
# text to numeric and store it in a new column.
df[["Claim_Value"]] <- as.numeric(gsub("Rs\\.|,", "", df[["Total Amount Claimed"]]))
df$Claim_Value
#[1] 5721 19087 1166 836 1034 2057

Related

How to bring column name from wide dataset as row in long dataset if specified value corresponded with row using R

the input dataset shows a "wide" dataset that includes unique actors and next to their name are corresponding movies as column name with a 1 assigned if movie corresponds to actors portfolio.
structure(list(Actor = c("Brad Pitt", "Matt Damon", "Leonardo Dicaprio",
"Kate Winslet", "Jennifer Connoley", "Jude Law", "Gwenyth Paltrow"
), `Once upon a time in america` = c(NA, NA, NA, NA, 1, NA, NA
), `The Departed` = c(NA, 1, 1, NA, NA, NA, NA), `Once Upon a time in Hollywood` = c(1,
NA, 1, NA, NA, NA, NA), `the holiday` = c(NA, NA, NA, 1, NA,
1, NA), titanic = c(NA, NA, 1, 1, NA, NA, NA), contagion = c(NA,
1, NA, 1, NA, 1, 1), `the talented mr ripley` = c(NA, 1, NA,
NA, NA, 1, 1), `Oceans Eleven` = c(1, 1, NA, NA, NA, NA, NA),
`Blood Diamond` = c(NA, NA, 1, NA, 1, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -7L))
What I would like to do is to create a "long" dataset that shows actor and their corresponding movie by title in the following row if there was a 1 previously assigned under the movie title column. Below is how i'd like to see the output.
structure(list(Actor = c("Brad Pitt", "Brad Pitt", "Matt Damon",
"Matt Damon", "Matt Damon", "Leonardo Dicaprio", "Leonardo Dicaprio",
"Leonardo Dicaprio", "Leonardo Dicaprio", "Kate Winslet", "Kate Winslet",
"Kate Winslet", "Jennifer Connoley", "Jennifer Connoley", "Jude Law",
"Jude Law", "Jude Law", "Gwenyth Paltrow", "Gwenyth Paltrow"),
Movie = c("Once Upon a time in Hollywood", "Oceans Eleven",
"The Departed", "Contagion", "The Talented MR Ripley", "The Departed",
"Once Upon a time in Hollywood", "Titanic", "Blood Diamond",
"The Holiday", "Titanic", "Contagion", "Once Upon a time in America",
"Blood Diamond", "The Holiday", "Contagion", "The Talented MR Ripley",
"Contagion", "The Talented MR Ripley")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -19L))
just use pivot_longer() and filter() from tidyverse
library(tidyverse)
# Reshape to one row per actor/movie pair. NA cells are dropped during the
# pivot, and the helper column is removed afterwards so that only Actor and
# Movie remain, as in the desired output.
data %>%
  pivot_longer(!Actor, names_to = "Movie", values_to = "value", values_drop_na = TRUE) %>%
  dplyr::select(-value)

Concatenating the strings of selected rows for every column

My data is as follows:
DF <- structure(list(toberevised = c("[Money amounts are in thousands of dollars]",
NA, NA, NA, "Item", NA, NA, NA, NA, "Number of returns", "Number of joint returns",
"Number with paid preparer's signature", "Number of exemptions",
"Adjusted gross income (AGI) [3]", "Salaries and wages in AGI: [4] Number",
"Salaries and wages in AGI: Amount", "Taxable interest: Number",
"Taxable interest: Amount", "Ordinary dividends: Number", "Ordinary dividends: Amount"
), ...2 = c("UNITED STATES [2]", NA, NA, NA, "All returns", NA,
NA, "1", NA, "135257620", "52607676", "80455243", "273738434",
"7364640131", "114060887", "5161583318", "59553985", "161324824",
"31158675", "164247298"), ...3 = c(NA, NA, NA, NA, "Under", "$50,000 [1]",
NA, "2", NA, "92150166", "20743943", "53622647", "159649737",
"1797097083", "75422766", "1541276272", "28527550", "39043002",
"13174923", "23867893"), ...4 = c(NA, NA, "Size of adjusted gross income",
NA, "50000", "under", "75000", "3", NA, "18221115", "11329459",
"11025624", "44189517", "1119634632", "16299827", "896339313",
"10891905", "16353293", "5255958", "12810282"), ...5 = c(NA,
NA, NA, NA, "75000", "under", "100000", "4", NA, "10499106",
"8296546", "6260725", "28555195", "905336768", "9520214", "721137490",
"7636612", "12852148", "4095938", "11524298"), ...6 = c(NA, NA,
NA, NA, "100000", "under", "200000", "5", NA, "10797979", "9193700",
"6678965", "30919226", "1429575727", "9782173", "1083175205",
"9092673", "23160862", "5824522", "25842394"), ...7 = c(NA, NA,
NA, NA, "200000", "or more", NA, "6", NA, "3589254", "3044028",
"2867282", "10424759", "2112995921", "3035907", "919655038",
"3405245", "69915518", "2807334", "90202431")), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
All I would like to do is concatenate for each column, rows 5, 6 and 7. I tried:
# NOTE(review): the second argument of lapply() must be a function. Here the
# paste(...) call is evaluated first and its string result is passed as FUN,
# which is what produces the error quoted below.
DF[,5:7] <- lapply(DF[,5:7], paste(DF[,5:7],collapse=" "))
But I get the error:
Error in get(as.character(FUN), mode = "function", envir = envir) :
variable names are limited to 10000 bytes
This happens even when I concatenate one row with another empty row instead (which obviously should not add many more bytes)!
# Concatenate rows 5 to 7 within every column. paste is passed as a function
# (wrapped so NA entries are skipped first); otherwise each NA would show up
# as the literal text "NA" in the result, e.g. "All returns NA NA".
lapply(DF[5:7, ], function(col) paste(col[!is.na(col)], collapse = " "))

How to create a for loop based on unique user IDs and specific event types

I have two data frames: users and events.
Both data frames contain a field that links events to users.
How can I create a for loop where every user's unique ID is matched against an event of a particular type and then stores the number of occurrences into a new column within users (users$conversation_started, users$conversation_missed, etc.)?
In short, it is a conditional for loop.
So far I have this but it is wrong:
# NOTE(review): this attempt cannot work as written, for several reasons:
#  - `event$type = "conversation-started"` uses `=` where the comparison `==`
#    is needed;
#  - it refers to `event` and `users$id`, while the data frames shown below are
#    named `events` and their id column is `_id`;
#  - the loop variable `i` is never used, so every pass assigns to the whole
#    `conversation_started` column rather than to one user's entry.
for(i in users$id){
users$conversation_started <- nrow(event[event$type = "conversation-started"])
}
An example of how to do this would be ideal.
The idea is:
for(each user)
find the matching user ID in events
count the number of event types == "conversation-started"
assign count value to user$conversation_started
end for
Important note:
The type field can contain one of five values so I will need to be able to effectively filter on each type for each associate:
> events$type %>% table %>% as.matrix
[,1]
conversation-accepted 3120
conversation-already-accepted 19673
conversation-declined 27
conversation-missed 831
conversation-request 23427
Data frames (note that these are reduced versions as confidential information has been removed):
users <- structure(list(`_id` = c("JTuXhdI4Ai", "iGIeCEXyVE", "6XFtOJh0bD",
"mNN986oQv9", "9NI71KBMX9", "x1jH7t0Cmy"), language = c("en",
"en", "en", "en", "en", "en"), registering = c(TRUE, TRUE, FALSE,
FALSE, FALSE, NA), `_created_at` = structure(c(1485995043.131,
1488898839.838, 1480461193.146, 1481407887.979, 1489942757.189,
1491311381.916), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`_updated_at` = structure(c(1521039527.236, 1488898864.834,
1527618624.877, 1481407959.116, 1490043838.561, 1491320333.09
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), lastOnlineTimestamp = c(1521039526.90314,
NA, 1480461472, 1481407959, 1490043838, NA), isAgent = c(FALSE,
NA, FALSE, FALSE, FALSE, NA), lastAvailableTime = structure(c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), class = c("POSIXct",
"POSIXt"), tzone = ""), available = c(NA, NA, NA, NA, NA,
NA), busy = c(NA, NA, NA, NA, NA, NA), joinedTeam = structure(c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), class = c("POSIXct",
"POSIXt"), tzone = ""), timezone = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
)), row.names = c("list.1", "list.2", "list.3", "list.4",
"list.5", "list.6"), class = "data.frame")
and
events <- structure(list(`_id` = c("JKY8ZwkM1S", "CG7Xj8dAsA", "pUkFFxoahy",
"yJVJ34rUCl", "XxXelkIFh7", "GCOsENVSz6"), expirationTime = structure(c(1527261147.873,
NA, 1527262121.332, NA, 1527263411.619, 1527263411.619), class = c("POSIXct",
"POSIXt"), tzone = ""), partId = c("d22bfddc-cd51-489f-aec8-5ab9225c0dd5",
"d22bfddc-cd51-489f-aec8-5ab9225c0dd5", "cf4356da-b63e-4e4d-8e7b-fb63035801d8",
"cf4356da-b63e-4e4d-8e7b-fb63035801d8", "a720185e-c300-47c0-b30d-64e1f272d482",
"a720185e-c300-47c0-b30d-64e1f272d482"), type = c("conversation-request",
"conversation-accepted", "conversation-request", "conversation-accepted",
"conversation-request", "conversation-request"), `_p_conversation` = c("Conversation$6nSaLeWqs7",
"Conversation$6nSaLeWqs7", "Conversation$6nSaLeWqs7", "Conversation$6nSaLeWqs7",
"Conversation$bDuAYSZgen", "Conversation$bDuAYSZgen"), `_p_merchant` = c("Merchant$0A2UYADe5x",
"Merchant$0A2UYADe5x", "Merchant$0A2UYADe5x", "Merchant$0A2UYADe5x",
"Merchant$0A2UYADe5x", "Merchant$0A2UYADe5x"), `_p_associate` = c("D9ihQOWrXC",
"D9ihQOWrXC", "D9ihQOWrXC", "D9ihQOWrXC", "D9ihQOWrXC", "D9ihQOWrXC"
), `_wperm` = list(list(), list(), list(), list(), list(), list()),
`_rperm` = list("*", "*", "*", "*", "*", "*"), `_created_at` = structure(c(1527264657.998,
1527264662.043, 1527265661.846, 1527265669.435, 1527266922.056,
1527266922.059), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`_updated_at` = structure(c(1527264657.998, 1527264662.043,
1527265661.846, 1527265669.435, 1527266922.056, 1527266922.059
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), read = c(TRUE,
NA, TRUE, NA, NA, NA), data.customerName = c("Shopper 109339",
NA, "Shopper 109339", NA, "Shopper 109364", "Shopper 109364"
), data.departmentName = c("Personal advisors", NA, "Personal advisors",
NA, "Personal advisors", "Personal advisors"), data.recurring = c(FALSE,
NA, TRUE, NA, FALSE, FALSE), data.new = c(TRUE, NA, FALSE,
NA, TRUE, TRUE), data.missed = c(0L, NA, 0L, NA, 0L, 0L),
data.customerId = c("84uOFRLmLd", "84uOFRLmLd", "84uOFRLmLd",
"84uOFRLmLd", "5Dw4iax3Tj", "5Dw4iax3Tj"), data.claimingTime = c(NA,
4L, NA, 7L, NA, NA), data.lead = c(NA, NA, FALSE, NA, NA,
NA), data.maxMissed = c(NA, NA, NA, NA, NA, NA), data.associateName = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), data.maxDecline = c(NA, NA, NA, NA, NA, NA
), data.goUnavailable = c(NA, NA, NA, NA, NA, NA)), row.names = c("list.1",
"list.2", "list.3", "list.4", "list.5", "list.6"), class = "data.frame")
Update: 21st September 2018
This solution now results in an NA-only data frame being produced at the end of the function. When written to a .csv, this is what I get (naturally, Excel displays NA-values as blank values):
My data source has not changed, nor has my script.
What might be causing this?
My guess is that an unforeseen case has occurred in which there were 0 hits for each step; as such, is there a way to add 0 to those cases where there weren't any hits, rather than NA/blank values?
Is there a way to avoid this?
New solution based on the provided data.
Note: As your data had no overlap in _id, I changed the events$_id to be the same as in users.
Simplified example data:
users <- structure(list(`_id` = structure(c(4L, 3L, 1L, 5L, 2L, 6L),
.Label = c("6XFtOJh0bD", "9NI71KBMX9", "iGIeCEXyVE",
"JTuXhdI4Ai", "mNN986oQv9", "x1jH7t0Cmy"),
class = "factor")), .Names = "_id",
row.names = c(NA, -6L), class = "data.frame")
events <- structure(list(`_id` = c("JKY8ZwkM1S", "CG7Xj8dAsA", "pUkFFxoahy",
"yJVJ34rUCl", "XxXelkIFh7", "GCOsENVSz6"),
type = c("conversation-request", "conversation-accepted",
"conversation-request", "conversation-accepted",
"conversation-request", "conversation-request")),
.Names = c("_id", "type"), class = "data.frame",
row.names = c("list.1", "list.2", "list.3", "list.4", "list.5", "list.6"))
events$`_id` <- users$`_id`
> users
_id
1 JTuXhdI4Ai
2 iGIeCEXyVE
3 6XFtOJh0bD
4 mNN986oQv9
5 9NI71KBMX9
6 x1jH7t0Cmy
> events
_id type
list.1 JTuXhdI4Ai conversation-request
list.2 iGIeCEXyVE conversation-accepted
list.3 6XFtOJh0bD conversation-request
list.4 mNN986oQv9 conversation-accepted
list.5 9NI71KBMX9 conversation-request
list.6 x1jH7t0Cmy conversation-request
We can use the same approach I suggested before, just enhance it a bit.
First we loop over unique(events$type) to store a table() of every type of event per id in a list:
# One contingency table per distinct event type: rows are event ids, columns
# record whether the event is of that type.
test <- lapply(
  unique(events$type),
  function(event_type) {
    table(events$`_id`, events$type == event_type)
  }
)
Then we store the specific type as the name of the respective table in the list:
# Label every table in the list with the event type it was built for.
test <- setNames(test, unique(events$type))
Now we use a simple for-loop to match() the user$_id with the rownames of the table and store the information in a new variable with the name of the event type:
# For every event type, look up each user's count in the matching table and
# store it in a column of `users` named after the type.
for (event_type in names(test)) {
  type_table <- test[[event_type]]
  # Pick the "TRUE" column by name: position 2 does not exist when every event
  # has this type, because the table then has no "FALSE" column.
  counts <- type_table[, "TRUE"][match(users$`_id`, rownames(type_table))]
  # Users without any event are absent from the table; report 0 instead of NA.
  counts[is.na(counts)] <- 0L
  users[, event_type] <- counts
}
Result:
> users
_id conversation-request conversation-accepted
1 JTuXhdI4Ai 1 0
2 iGIeCEXyVE 0 1
3 6XFtOJh0bD 1 0
4 mNN986oQv9 0 1
5 9NI71KBMX9 1 0
6 x1jH7t0Cmy 1 0
Hope this helps!

R : Merge 2 different data frames every x rows ( every 5 rows then 7 then maybe 3 etc)

I need some help merging 2 data frames in RStudio.
I would like to merge my PP data frame into my MergeHeader data frame following this pattern :
Every time there is an incremented number in the
instance_reporting_properties.Numéro_x0020_du_x0020_formulaire column, I want to merge one row of the PP data frame.
For example,
I want to add the 5th row of PP data frame to the row with the Value 5 in the mergeHeader data frame.
Then nothing for a few rows and do it again for the 6th row of PP.
And so forth.
The trick being that there’s not the same number of NA rows between each value(in MergeHeader dataframe).
Here a simplified version of my data frames :
> dput(head(tmpPP, 5))
structure(list(`_x0031__x0020_-_x0020_Type_x0020_de_x0020_demande_x0020__x0028_Alias_x0029_` = c("Demande de création",
"Demande de création", NA, NA), `_x0032__x0020_-_x0020_Numéro_x0020_du_x0020_formulaire_x0020__x0028_Alias_x0029_` = c("N 5",
"N 6", NA, NA), `_x0033__x0020_-_x0020_Demandeur_x0020__x0028_Alias_x0029_` = c("par MOLLE Francois pour",
"par CABRERA Fabienne pour", NA, NA), `_x0034__x0020_-_x0020_Fournisseur_x0020__x0028_Alias_x0029_` = c("MONNIEZ CHRISTOPHE JACQUES",
"A . C . C . REFERENCE TOURS", NA, NA), `_x0035__x0020_-_x0020_Date_x0020_de_x0020_la_x0020_demande_x0020__x0028_Alias_x0029_` = c("le Monday, January 2, 2017 8:44:07 AM",
"le Monday, January 2, 2017 8:56:47 AM", NA, NA), Code_x0020_fournisseur = c("FA01070-59242",
"FA01792-74000", "FA01072-68110", "En cours"), Code_x0020_NAF = c("1071C",
"7911Z", NA, NA), Date_x0020_de_x0020_la_x0020_demande = c("1/2/2017",
"1/2/2017", "1/2/2017", "1/2/2017"), `Délai_x0020_de_x0020_paiement` = c("45 jours fin de mois",
"45 jours fin de mois", "Comptant", "Comptant"), Demandeur = c("MOLLE Francois",
"CABRERA Fabienne", "DI MARCO Elio", "CHOPIN Anne Flore"), Famille_x0020_Achat = c("061101 - Consommables alimentaires, boissons et dérivés",
"060302 - Agences de voyages", NA, NA), Fournisseur = c("MONNIEZ CHRISTOPHE JACQUES",
"A . C . C . REFERENCE TOURS", "CONTROLE TECHNIQUE PASSION ILLZACH",
"DIRECTION REGIONALE FINANCES PUBLIQUES PROVENCE ALPES COTE D'AZUR ET DEPARTEMENT BOUCHES DU RHONE"
), Mode_x0020_de_x0020_paiement_x0020_1 = c("VIR", "VIR", "VIR",
"VIR"), Nom_x0020_Approbateur = c("Vincent DESTOT", "BURTEY Thomas",
"Laurent GELHAYE", NA), Nom_x0020_Gestionnaire_x0020_RFN = c("Elisa WATELLOO",
"KOLANUS Karolina", "Elisa WATELLOO", "Elisa WATELLOO"), `Numéro_x0020_du_x0020_formulaire` = c("5",
"6", "7", "8"), Personne_x0020_en_x0020_charge = c("Réviseurs",
"Réviseurs", "Réviseurs", NA), `Pôle_x0020_du_x0020_demandeur` = c("Hauts De France",
"Bourgogne - Franche - Comté", "Grand Est", "Société EAP"),
Statut = c("Finalisée", "Finalisée", "Finalisée", "Demande refusée Gestionnaire"
), Type_x0020_de_x0020_demande = c("Demande de création",
"Demande de création", "Demande de création", "Demande de création"
), Type_x0020_de_x0020_fournisseur = c("Français", NA, "Français",
"Français"), `Date_x0020_de_x0020_création` = c("1/2/2017",
"1/1/0001", "1/2/2017", NA), Date_x0020_de_x0020_rejet = c(NA,
NA, "1/1/0001", NA), Prestataire_x0020_de_x0020_service = c(NA,
"Oui", "Oui", NA), Mode_x0020_de_x0020_paiement_x0020_2 = c(NA_character_,
NA_character_, NA_character_, NA_character_), Motif_x0020_de_x0020_rejet = c(NA_character_,
NA_character_, NA_character_, NA_character_), `Activité_x0020_Tourisme` = c(NA_character_,
NA_character_, NA_character_, NA_character_)), .Names = c("_x0031__x0020_-_x0020_Type_x0020_de_x0020_demande_x0020__x0028_Alias_x0029_",
"_x0032__x0020_-_x0020_Numéro_x0020_du_x0020_formulaire_x0020__x0028_Alias_x0029_",
"_x0033__x0020_-_x0020_Demandeur_x0020__x0028_Alias_x0029_",
"_x0034__x0020_-_x0020_Fournisseur_x0020__x0028_Alias_x0029_",
"_x0035__x0020_-_x0020_Date_x0020_de_x0020_la_x0020_demande_x0020__x0028_Alias_x0029_",
"Code_x0020_fournisseur", "Code_x0020_NAF", "Date_x0020_de_x0020_la_x0020_demande",
"Délai_x0020_de_x0020_paiement", "Demandeur", "Famille_x0020_Achat",
"Fournisseur", "Mode_x0020_de_x0020_paiement_x0020_1", "Nom_x0020_Approbateur",
"Nom_x0020_Gestionnaire_x0020_RFN", "Numéro_x0020_du_x0020_formulaire",
"Personne_x0020_en_x0020_charge", "Pôle_x0020_du_x0020_demandeur",
"Statut", "Type_x0020_de_x0020_demande", "Type_x0020_de_x0020_fournisseur",
"Date_x0020_de_x0020_création", "Date_x0020_de_x0020_rejet",
"Prestataire_x0020_de_x0020_service", "Mode_x0020_de_x0020_paiement_x0020_2",
"Motif_x0020_de_x0020_rejet", "Activité_x0020_Tourisme"), .internal.selfref = <pointer: (nil)>, row.names = 5:8, class = c("data.table",
"data.frame"))
dputTmp <- dput(head(tmp,18))
structure(list(`instance_reporting_properties.Numéro_x0020_du_x0020_formulaire` = c("5",
NA, NA, NA, NA, "6", NA, NA, NA, NA, NA, NA, "7", NA, NA, NA,
NA, "8"), `instance_reporting_properties.Pôle_x0020_de_x0020_rattachement` = c("Hauts De France",
NA, NA, NA, NA, "Bourgogne - Franche - Comté", NA, NA, NA, NA,
NA, NA, "Grand Est", NA, NA, NA, NA, "Société EAP")), .Names = c("instance_reporting_properties.Numéro_x0020_du_x0020_formulaire",
"instance_reporting_properties.Pôle_x0020_de_x0020_rattachement"
), row.names = 21:38, class = "data.frame")

Extract specific columns from dataset, create column of NAs if it doesn't exist

Data frame df has 57 columns. I later read in other csv files, each of which may have the same 57, but more likely have more or fewer columns. I take the names of the original file as:
df = read.csv(...)
str = colnames(df)
I know I can take subsets of a data frame as:
file = read.csv(...)
file = file[, str]
If the columns of file have the same or greater number of columns than the original 57, this will work fine. The extra columns would simply be dropped. However, if the columns of file are fewer than the original 57, the following error arises:
Error in `[.data.frame`(file, , str) : undefined columns selected
Is there a way to take this same approach, but create columns of NA if the column does not exist in file?
EDIT: Including dput output for #akrun. I'm not familiar with dput, so I hope this is what you were asking for:
File 1 example:
`structure(list(ObservationURI = c("http://resources.usgin.org/uri-gin/wygs/bhtemp/49-037-20341_182_12296/",
"http://resources.usgin.org/uri-gin/wygs/bhtemp/49-037-20341_215_14316/",
"http://resources.usgin.org/uri-gin/wygs/bhtemp/49-037-20341_236_16496/"
), WellName = c("1 BRADY UNIT ANADARKO E&P COMPANY LP", "1 BRADY UNIT ANADARKO E&P COMPANY LP",
"1 BRADY UNIT ANADARKO E&P COMPANY LP"), APINo = c("49-037-20341",
"49-037-20341", "49-037-20341"), HeaderURI = c("http://resources.usgin.org/uri-gin/wygs/well/3720341/",
"http://resources.usgin.org/uri-gin/wygs/well/3720341/", "http://resources.usgin.org/uri-gin/wygs/well/3720341/"
), OtherID = c(3720341, 3720341, 3720341), OtherName = c(NA,
NA, NA), BoreholeName = c(NA, NA, NA), Label = c("Temperature observation for well 3720341",
"Temperature observation for well 3720341", "Temperature observation for well 3720341"
), Operator = c("", "", ""), LeaseName = c("", "", ""), LeaseOwner = c("",
"", ""), LeaseNo = c("", "", ""), SpudDate = c("1900-01-01T00:00",
"1900-01-01T00:00", "1900-01-01T00:00"), EndedDrillingDate = c("",
"", ""), WellType = c("Oil", "Oil", "Oil"), Status = c("Producing Oil Well",
"Producing Oil Well", "Producing Oil Well"), CommodityOfInterest = c("",
"", ""), StatusDate = c("1973-05-03T00:00:00", "1973-05-03T00:00:00",
"1973-05-03T00:00:00"), Function = c(NA, NA, NA), Production = c(NA,
NA, NA), ProducingInterval = c(NA, NA, NA), ReleaseDate = c(NA,
NA, NA), Field = c("", "", ""), OtherLocationName = c("Great Divide Basin",
"Great Divide Basin", "Great Divide Basin"), County = c("Sweetwater",
"Sweetwater", "Sweetwater"), State = c("WY", "WY", "WY"), PLSS_Meridians = c(NA,
NA, NA), TWP = c("16N", "16N", "16N"), RGE = c("101W", "101W",
"101W"), Section_ = c(11, 11, 11), SectionPart = c("NENW", "NENW",
"NENW"), Parcel = c(NA, NA, NA), UTM_E = c(NA, NA, NA), UTM_N = c(NA,
NA, NA), UTMDatumZone = c(NA, NA, NA), LatDegree = c(41.38696,
41.38696, 41.38696), LongDegree = c(-108.75009, -108.75009, -108.75009
), SRS = c("EPSG:4326", "EPSG:4326", "EPSG:4326"), LocationUncertaintyStatement = c("nil:missing",
"nil:missing", "nil:missing"), LocationUncertaintyCode = c(NA,
NA, NA), LocationUncertaintyRadius = c(NA, NA, NA), DrillerTotalDepth = c(NA_real_,
NA_real_, NA_real_), DepthReferencePoint = c(NA, NA, NA), LengthUnits = c("ft",
"ft", "ft"), WellBoreShape = c(NA, NA, NA), TrueVerticalDepth = c(NA,
NA, NA), ElevationKB = c(7135, 7135, 7135), ElevationDF = c(7106,
7106, 7106), ElevationGL = c(0, 0, 0), FormationTD = c("", "",
""), BitDiameterCollar = c(NA, NA, NA), BitDiameterTD = c(NA_real_,
NA_real_, NA_real_), DiameterUnits = c("", "", ""), Notes = c("Depth of measurement assumed to be equal to driller total depth (CRC-AZGS, 2013).",
"Depth of measurement assumed to be equal to driller total depth (CRC-AZGS, 2013).",
"Depth of measurement assumed to be equal to driller total depth (CRC-AZGS, 2013)."
), MaximumRecordedTemperature = c(NA_real_, NA_real_, NA_real_
), MeasuredTemperature = c(182, 215, 236), CorrectedTemperature = c(NA_real_,
NA_real_, NA_real_), TemperatureUnits = c(FALSE, FALSE, FALSE
), TimeSinceCirculation = c(NA_real_, NA_real_, NA_real_), CirculationDuration = c(11,
12, 12), MeasurementProcedure = c("Well log", "Well log", "Well log"
), CorrectionType = c(NA, NA, NA), DepthOfMeasurement = c(-99999,
-99999, -99999), MeasurementDateTime = c("", "", ""), MeasurementFormation = c("",
"", ""), MeasurementSource = c("Richard W. Davis: Deriving geothermal parameters from bottom-hole temperatures in Wyoming\" AAPG bulletin, V. 96, No. 8 (August 2012), pp. 1579-1592",
"Richard W. Davis: Deriving geothermal parameters from bottom-hole temperatures in Wyoming\" AAPG bulletin, V. 96, No. 8 (August 2012), pp. 1579-1592",
"Richard W. Davis: Deriving geothermal parameters from bottom-hole temperatures in Wyoming\" AAPG bulletin, V. 96, No. 8 (August 2012), pp. 1579-1592"
), RelatedResource = c(NA, NA, NA), CasingLogger = c(NA, NA,
NA), CasingBottomDepthDriller = c(NA, NA, NA), CasingTopDepth = c(NA_real_,
NA_real_, NA_real_), CasingPipeDiameter = c(NA, NA, NA), CasingWeight = c(NA,
NA, NA), CasingWeightUnits = c(NA, NA, NA), CasingThickness = c(NA,
NA, NA), DrillingFluid = c("", "", ""), Salinity = c(NA_real_,
NA_real_, NA_real_), MudResistivity = c(NA_real_, NA_real_, NA_real_
), Density = c(NA_real_, NA_real_, NA_real_), FluidLevel = c(NA_real_,
NA_real_, NA_real_), pH = c(NA_real_, NA_real_, NA_real_), Viscosity = c(NA_real_,
NA_real_, NA_real_), FluidLoss = c(NA_real_, NA_real_, NA_real_
), MeasurementNotes = c(NA, NA, NA), InformationSource = c("Wyoming State Geological Survey",
"Wyoming State Geological Survey", "Wyoming State Geological Survey"
)), .Names = c("ObservationURI", "WellName", "APINo", "HeaderURI",
"OtherID", "OtherName", "BoreholeName", "Label", "Operator",
"LeaseName", "LeaseOwner", "LeaseNo", "SpudDate", "EndedDrillingDate",
"WellType", "Status", "CommodityOfInterest", "StatusDate", "Function",
"Production", "ProducingInterval", "ReleaseDate", "Field", "OtherLocationName",
"County", "State", "PLSS_Meridians", "TWP", "RGE", "Section_",
"SectionPart", "Parcel", "UTM_E", "UTM_N", "UTMDatumZone", "LatDegree",
"LongDegree", "SRS", "LocationUncertaintyStatement", "LocationUncertaintyCode",
"LocationUncertaintyRadius", "DrillerTotalDepth", "DepthReferencePoint",
"LengthUnits", "WellBoreShape", "TrueVerticalDepth", "ElevationKB",
"ElevationDF", "ElevationGL", "FormationTD", "BitDiameterCollar",
"BitDiameterTD", "DiameterUnits", "Notes", "MaximumRecordedTemperature",
"MeasuredTemperature", "CorrectedTemperature", "TemperatureUnits",
"TimeSinceCirculation", "CirculationDuration", "MeasurementProcedure",
"CorrectionType", "DepthOfMeasurement", "MeasurementDateTime",
"MeasurementFormation", "MeasurementSource", "RelatedResource",
"CasingLogger", "CasingBottomDepthDriller", "CasingTopDepth",
"CasingPipeDiameter", "CasingWeight", "CasingWeightUnits", "CasingThickness",
"DrillingFluid", "Salinity", "MudResistivity", "Density", "FluidLevel",
"pH", "Viscosity", "FluidLoss", "MeasurementNotes", "InformationSource"
), row.names = c(NA, 3L), class = "data.frame")`
File 2 example:
`structure(list(ObservationURI = c("http://resources.usgin.org/uri-gin/mags/bhtemp/UM:MA-Weston47-422036N0711640.1/",
"http://resources.usgin.org/uri-gin/mags/bhtemp/UM:MA-Dover20-421431N0711752.1/",
"http://resources.usgin.org/uri-gin/mags/bhtemp/UM:MA-Lincoln13-422440N0711815.1/"
), WellName = c("Weston47-USGS HDR19", "Dover20-USGS HDR19",
"Lincoln13-USGS HDR19"), APINo = c(NA, NA, NA), HeaderURI = c("http://resources.usgin.org/uri-gin/mags/well/Weston47-USGS_HDR19/",
"http://resources.usgin.org/uri-gin/mags/well/Dover20-USGS_HDR19/",
"http://resources.usgin.org/uri-gin/mags/well/Lincoln13-USGS_HDR19/"
), OtherID = c("", "", ""), OtherName = c("", "", ""), BoreholeName = c(NA,
NA, NA), Operator = c(NA, NA, NA), LeaseOwner = c(NA, NA, NA),
LeaseNo = c(NA, NA, NA), SpudDate = c(NA, NA, NA), EndedDrillingDate = c("",
"", ""), WellType = c("temporarily abandoned", "observation",
"observation"), Status = c("Idle", "Idle", "Idle"), CommodityOfInterest = c("Water",
"Water", "Water"), StatusDate = c("", "", ""), Function = c("production",
"monitoring", "monitoring"), Production = c(NA, NA, NA),
Field = c(NA, NA, NA), County = c("Middlesex", "Norfolk",
"Middlesex"), State = c("MA", "MA", "MA"), PLSS_Meridians = c(NA,
NA, NA), TWP = c(NA, NA, NA), RGE = c(NA, NA, NA), Section_ = c(NA,
NA, NA), SectionPart = c(NA, NA, NA), Parcel = c(NA, NA,
NA), UTM_E = c(NA, NA, NA), UTM_N = c(NA, NA, NA), LatDegree = c(42.3147771183,
42.2417748607, 42.4110851252), LongDegree = c(-71.3257301787,
-71.2975422044, -71.3034583949), SRS = c("EPSG:4326", "EPSG:4326",
"EPSG:4326"), LocationUncertaintyStatement = c("Field located on topographic map",
"Field located on topographic map", "Field located on topographic map"
), DrillerTotalDepth = c(29, 22, 20), LengthUnits = c("ft",
"ft", "ft"), WellBoreShape = c("Vertical", "Vertical", "Vertical"
), TrueVerticalDepth = c(NA, NA, NA), ElevationGL = c(140,
150, 180), BitDiameterTD = c(72, 48, 42), DiameterUnits = c("in",
"in", "in"), Notes = c("", "", ""), MeasuredTemperature = c(8,
9, 8.5), CorrectedTemperature = c(NA, NA, NA), TemperatureUnits = c("C",
"C", "C"), TimeSinceCirculation = c(NA, NA, NA), CirculationDuration = c(NA,
NA, NA), MeasurementProcedure = c("Samples collected from spigot or faucet nearest to well. Water run until temperature, pH or specific conductance stablized. Temperature measured with a mercury thermometer to nearest half degree in degrees F. Converted to degrees C for table.",
"Samples collected from spigot or faucet nearest to well. Water run until temperature, pH or specific conductance stablized. Temperature measured with a mercury thermometer to nearest half degree in degrees F. Converted to degrees C for table.",
"Samples collected from spigot or faucet nearest to well. Water run until temperature, pH or specific conductance stablized. Temperature measured with a mercury thermometer to nearest half degree in degrees F. Converted to degrees C for table."
), CorrectionType = c(NA, NA, NA), DepthOfMeasurement = c(NA,
NA, NA), MeasurementDateTime = c(NA, NA, NA), MeasurementFormation = c(NA,
NA, NA), MeasurementSource = c("Walker, Eugene H., William W. Caswell, and S. William Wandle, Jr. Hydrologic Data of the Charles River Basin",
"Walker, Eugene H., William W. Caswell, and S. William Wandle, Jr. Hydrologic Data of the Charles River Basin",
"Walker, Eugene H., William W. Caswell, and S. William Wandle, Jr. Hydrologic Data of the Charles River Basin"
), CasingLogger = c(" Massachusetts\". USGS Massachusetts Hydrologic-Data Report No. 19 (1977): 1-57. Print. ftp://eclogite.geo.umass.edu/pub/stategeologist/Products/Geothermal/BoreholeTemperatureData/DataReport19.pdf\"",
" Massachusetts\". USGS Massachusetts Hydrologic-Data Report No. 19 (1977): 1-57. Print. ftp://eclogite.geo.umass.edu/pub/stategeologist/Products/Geothermal/BoreholeTemperatureData/DataReport19.pdf\"",
" Massachusetts\". USGS Massachusetts Hydrologic-Data Report No. 19 (1977): 1-57. Print. ftp://eclogite.geo.umass.edu/pub/stategeologist/Products/Geothermal/BoreholeTemperatureData/DataReport19.pdf\""
), CasingDepthDriller = c("", "", ""), CasingPipeDiameter = c("",
"", ""), CasingWeight = c(NA, NA, NA), CasingWeightUnits = c(NA,
NA, NA), CasingThickness = c(NA, NA, NA), DrillingFluid = c(NA,
NA, NA), Salinity = c(NA, NA, NA), MudResisitivity = c(NA,
NA, NA), Density = c(NA, NA, NA), FluidLevel = c(NA, NA,
NA), pH = c(NA, NA, NA), Viscosity = c(NA, NA, NA), FluidLoss = c(NA,
NA, NA), Unnamed..66 = c(NA, NA, NA), BitDiameterCollar = c(72,
48, 42), Unnamed..68 = c(NA, NA, NA), InformationSource = c("Stephen Mabee, MA State Geologist, University of Massachusetts, 611 North Pleasant Street, Amherst MA 01003 413-545-2285",
"Stephen Mabee, MA State Geologist, University of Massachusetts, 611 North Pleasant Street, Amherst MA 01003 413-545-2285",
"Stephen Mabee, MA State Geologist, University of Massachusetts, 611 North Pleasant Street, Amherst MA 01003 413-545-2285"
)), .Names = c("ObservationURI", "WellName", "APINo", "HeaderURI",
"OtherID", "OtherName", "BoreholeName", "Operator", "LeaseOwner",
"LeaseNo", "SpudDate", "EndedDrillingDate", "WellType", "Status",
"CommodityOfInterest", "StatusDate", "Function", "Production",
"Field", "County", "State", "PLSS_Meridians", "TWP", "RGE", "Section_",
"SectionPart", "Parcel", "UTM_E", "UTM_N", "LatDegree", "LongDegree",
"SRS", "LocationUncertaintyStatement", "DrillerTotalDepth", "LengthUnits",
"WellBoreShape", "TrueVerticalDepth", "ElevationGL", "BitDiameterTD",
"DiameterUnits", "Notes", "MeasuredTemperature", "CorrectedTemperature",
"TemperatureUnits", "TimeSinceCirculation", "CirculationDuration",
"MeasurementProcedure", "CorrectionType", "DepthOfMeasurement",
"MeasurementDateTime", "MeasurementFormation", "MeasurementSource",
"CasingLogger", "CasingDepthDriller", "CasingPipeDiameter", "CasingWeight",
"CasingWeightUnits", "CasingThickness", "DrillingFluid", "Salinity",
"MudResisitivity", "Density", "FluidLevel", "pH", "Viscosity",
"FluidLoss", "Unnamed..66", "BitDiameterCollar", "Unnamed..68",
"InformationSource"), row.names = c(NA, 3L), class = "data.frame")`
We can read the datasets into a list with fread and then use rbindlist from data.table with the fill = TRUE and idcol arguments to create a single data.table object. fill = TRUE ensures that NA elements are created for those datasets that have fewer columns.
# data.table supplies fread() and rbindlist()
library(data.table)
# Collect the CSV files in the working directory. `pattern` is a regular
# expression, so escape the dot and anchor it at the end of the name; a bare
# ".csv" would also match names such as "notcsv.txt" or "data.csv.bak".
files <- list.files(pattern = "\\.csv$")
# Read every file with fread(), then stack the resulting tables.
# fill = TRUE pads columns missing from a file with NA; idcol = "grp" adds
# a column holding each file's position in `files`.
rbindlist(lapply(files, fread), fill = TRUE, idcol = "grp")

Resources