Concatenating the strings of selected rows for every column - r

My data is as follows:
DF <- structure(list(toberevised = c("[Money amounts are in thousands of dollars]",
NA, NA, NA, "Item", NA, NA, NA, NA, "Number of returns", "Number of joint returns",
"Number with paid preparer's signature", "Number of exemptions",
"Adjusted gross income (AGI) [3]", "Salaries and wages in AGI: [4] Number",
"Salaries and wages in AGI: Amount", "Taxable interest: Number",
"Taxable interest: Amount", "Ordinary dividends: Number", "Ordinary dividends: Amount"
), ...2 = c("UNITED STATES [2]", NA, NA, NA, "All returns", NA,
NA, "1", NA, "135257620", "52607676", "80455243", "273738434",
"7364640131", "114060887", "5161583318", "59553985", "161324824",
"31158675", "164247298"), ...3 = c(NA, NA, NA, NA, "Under", "$50,000 [1]",
NA, "2", NA, "92150166", "20743943", "53622647", "159649737",
"1797097083", "75422766", "1541276272", "28527550", "39043002",
"13174923", "23867893"), ...4 = c(NA, NA, "Size of adjusted gross income",
NA, "50000", "under", "75000", "3", NA, "18221115", "11329459",
"11025624", "44189517", "1119634632", "16299827", "896339313",
"10891905", "16353293", "5255958", "12810282"), ...5 = c(NA,
NA, NA, NA, "75000", "under", "100000", "4", NA, "10499106",
"8296546", "6260725", "28555195", "905336768", "9520214", "721137490",
"7636612", "12852148", "4095938", "11524298"), ...6 = c(NA, NA,
NA, NA, "100000", "under", "200000", "5", NA, "10797979", "9193700",
"6678965", "30919226", "1429575727", "9782173", "1083175205",
"9092673", "23160862", "5824522", "25842394"), ...7 = c(NA, NA,
NA, NA, "200000", "or more", NA, "6", NA, "3589254", "3044028",
"2867282", "10424759", "2112995921", "3035907", "919655038",
"3405245", "69915518", "2807334", "90202431")), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
All I would like to do is concatenate for each column, rows 5, 6 and 7. I tried:
DF[,5:7] <- lapply(DF[,5:7], paste(DF[,5:7],collapse=" "))
But I get the error:
Error in get(as.character(FUN), mode = "function", envir = envir) :
variable names are limited to 10000 bytes
This happens even when I concatenate one row with another, empty row instead (which obviously should not add many more bytes)!

lapply(DF[5:7, ], paste, collapse=" ")

Related

How do I create a line graph using multiple variables when the multiple variables are all in the same column?

structure(list(Sample.Id = c(NA, "2", "2", "2", "2", "2", "2",
"2", "2", "2", "2", "3", "3", "3", "3", "3", "3", "3", "3", "3"
), Sampling..Date = c(NA, "08-Sep-14", "14-Oct-14", "02-Nov-14",
"21-Nov-14", "03-Dec-14", "15-Dec-14", "11-Jan-15", "08-Feb-15",
"01-Mar-15", "06-Apr-15", "03-Sep-14", "08-Sep-14", "14-Oct-14",
"02-Nov-14", "21-Nov-14", "03-Dec-14", "15-Dec-14", "11-Jan-15",
"26-Jan-15"), Tot.P = c("µg/ml", "0.002", "0.017", "0.035",
"0.04", "0.059", "0.155", "0.021", "0.022", "0.025", "<0.009",
"0.021", "0.003", "0.036", "0.141", "0.041", "0.044", "0.01",
"0.023", "0.016"), DOC = c("µg/ml", NA, "12.3", "13.4", "12.5",
"9.9", "14.7", "8.8", "8.3", "0.026", "7.5", "13.4", NA, "14.6",
"16.6", "14.7", "12.6", "12.6", "10.6", "11.4"), Tot.N = c("µg/ml",
NA, "3.63", "4.12", "3.98", "4.08", "3.38", "3.63", "4.88", "8.3",
"2.74", "2.48", NA, "3.07", "3.38", "3.3", "3.43", "2.19", "2.77",
"4.25"), DOC.1 = c("µg/ml", "13.6", NA, NA, NA, NA, NA, NA,
NA, NA, NA, "14.44", "16.85", NA, NA, NA, NA, NA, NA, NA), Tot.P.1 = c("µg/ml",
"0.053", NA, NA, NA, NA, NA, NA, NA, NA, NA, "0.08", "0.071",
NA, NA, NA, NA, NA, NA, NA), Total.N = c("µg/ml", "3.363", NA,
NA, NA, NA, NA, NA, NA, NA, NA, "2.645", "2.637", NA, NA, NA,
NA, NA, NA, NA)), row.names = c(NA, 20L), class = "data.frame"
I have a set of water quality data from 2014-2022 over different sites and different time periods. Each site has a different monitoring period and the data was analysed using two different devices of which there are only two periods of overlap where the samples were analysed using both machines. I am trying to plot a time series showing the P, N and DOC across each site over time and shade in the areas where one machine was used instead of another. This is all a bit complicated and I am so new to R so have been running in circles for a week. My problem is I am unsure how to select the section of a column I need to create the variable I want so it makes sense.
I have tried to look it up on blogs but can't seem to mash the different pieces of advice together to make it work. Any tips would be much appreciated. Here is the data that I'm on about.
You will definitely need to clean up your data to fit this solution, but the basic approach is to pivot from wide to long form.
Then you need to ensure that your dates are in the proper POSIXct format.
Then it is just a matter of grouping by your relevant variables and plotting with geom_line()
I added the facet_grid to separate by Sample.Id.
library(tidyverse)
#> Warning: pakke 'ggplot2' blev bygget under R version 4.2.2
#> Warning: pakke 'tidyr' blev bygget under R version 4.2.2
#> Warning: pakke 'purrr' blev bygget under R version 4.2.2
#> Warning: pakke 'dplyr' blev bygget under R version 4.2.2
#> Warning: pakke 'stringr' blev bygget under R version 4.2.2
#> Warning: pakke 'forcats' blev bygget under R version 4.2.2
df <- structure(list(Sample.Id = c("2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "3", "3", "3", "3", "3", "3", "3", "3", "3"),
Sampling..Date = c("08-Sep-14", "14-Oct-14", "02-Nov-14",
"21-Nov-14", "03-Dec-14", "15-Dec-14", "11-Jan-15", "08-Feb-15",
"01-Mar-15", "06-Apr-15", "03-Sep-14", "08-Sep-14", "14-Oct-14",
"02-Nov-14", "21-Nov-14", "03-Dec-14", "15-Dec-14", "11-Jan-15",
"26-Jan-15"), Tot.P = c("0.002", "0.017", "0.035", "0.04",
"0.059", "0.155", "0.021", "0.022", "0.025", "<0.009", "0.021",
"0.003", "0.036", "0.141", "0.041", "0.044", "0.01", "0.023",
"0.016"), DOC = c(NA, "12.3", "13.4", "12.5", "9.9", "14.7",
"8.8", "8.3", "0.026", "7.5", "13.4", NA, "14.6", "16.6",
"14.7", "12.6", "12.6", "10.6", "11.4"), Tot.N = c(NA, "3.63",
"4.12", "3.98", "4.08", "3.38", "3.63", "4.88", "8.3", "2.74",
"2.48", NA, "3.07", "3.38", "3.3", "3.43", "2.19", "2.77",
"4.25"), DOC.1 = c("13.6", NA, NA, NA, NA, NA, NA, NA, NA,
NA, "14.44", "16.85", NA, NA, NA, NA, NA, NA, NA)), row.names = 2:20, class = "data.frame")
df |>
mutate(Tot.P = str_replace(Tot.P, "<", ""),
across(Tot.P:DOC.1, as.numeric),
Sampling..Date = as.POSIXct(Sampling..Date, format = "%d-%b-%y")) |>
select(-c(DOC.1)) |>
pivot_longer(cols = c(Tot.P, DOC, Tot.N)) |>
ggplot(aes(x = Sampling..Date, y = value, group = name, col = name)) +
geom_line() +
facet_grid(~Sample.Id)
#> Warning: Removed 5 rows containing missing values (`geom_line()`).
Created on 2023-02-14 with reprex v2.0.2

How to bring column name from wide dataset as row in long dataset if specified value corresponded with row using R

the input dataset shows a "wide" dataset that includes unique actors and next to their name are corresponding movies as column name with a 1 assigned if movie corresponds to actors portfolio.
structure(list(Actor = c("Brad Pitt", "Matt Damon", "Leonardo Dicaprio",
"Kate Winslet", "Jennifer Connoley", "Jude Law", "Gwenyth Paltrow"
), `Once upon a time in america` = c(NA, NA, NA, NA, 1, NA, NA
), `The Departed` = c(NA, 1, 1, NA, NA, NA, NA), `Once Upon a time in Hollywood` = c(1,
NA, 1, NA, NA, NA, NA), `the holiday` = c(NA, NA, NA, 1, NA,
1, NA), titanic = c(NA, NA, 1, 1, NA, NA, NA), contagion = c(NA,
1, NA, 1, NA, 1, 1), `the talented mr ripley` = c(NA, 1, NA,
NA, NA, 1, 1), `Oceans Eleven` = c(1, 1, NA, NA, NA, NA, NA),
`Blood Diamond` = c(NA, NA, 1, NA, 1, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -7L))
What I would like to do is to create a "long" dataset that shows each actor and their corresponding movie by title in the following row if there was a 1 previously assigned under the movie title column. Below is how I'd like to see the output.
structure(list(Actor = c("Brad Pitt", "Brad Pitt", "Matt Damon",
"Matt Damon", "Matt Damon", "Leonardo Dicaprio", "Leonardo Dicaprio",
"Leonardo Dicaprio", "Leonardo Dicaprio", "Kate Winslet", "Kate Winslet",
"Kate Winslet", "Jennifer Connoley", "Jennifer Connoley", "Jude Law",
"Jude Law", "Jude Law", "Gwenyth Paltrow", "Gwenyth Paltrow"),
Movie = c("Once Upon a time in Hollywood", "Oceans Eleven",
"The Departed", "Contagion", "The Talented MR Ripley", "The Departed",
"Once Upon a time in Hollywood", "Titanic", "Blood Diamond",
"The Holiday", "Titanic", "Contagion", "Once Upon a time in America",
"Blood Diamond", "The Holiday", "Contagion", "The Talented MR Ripley",
"Contagion", "The Talented MR Ripley")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -19L))
just use pivot_longer() and filter() from tidyverse
library(tidyverse)
data %>% pivot_longer(!Actor,names_to="Movie",values_to="value") %>% dplyr::filter(!is.na(value))

Using gsub for removing unwanted characters : facing issues

df$Claim_Value <- gsub("Rs.", "", df$`Total Amount Claimed`)
I checked class(df$`Total Amount Claimed`): it shows numeric.
Will gsub work for a numeric column?
Here df$`Total Amount Claimed` is a column which holds the amount prefixed with the text Rs.
For example: Rs.200000. I am trying to remove Rs. from this column, so I used gsub. It's working, but it shows the amount in thousands and not in lakhs.
How can I show the amount in lakhs?
structure(list(Approver = c("Amarjeet Singh", "Amit Barot", "Amit Barot",
"Amit Barot", "Amit Barot", "Amit Barot"), `Assigned To` = c("SOLUTIONS.MOHALI#PALENGINEERS.IN",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM"), `Resolution Date` = structure(c(1609341652,
1574165400, 1591818814, 1592327216, 1592397052, 1592496000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), `Allow Submit Till Date` = structure(c(NA,
1589414400, NA, NA, NA, NA), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`Amt App by CSS (Without Tax)` = c(NA, NA, NA, NA, NA, NA
), `ESN/Alternator No.` = c("AAG8045S126087", "84846607",
"22321621", "191014875", "25452001", "78939252"), `Auto Approved` = c("No",
"No", "No", "No", "No", "No"), BIS = c("N", "N", "N", "Y",
"Y", "N"), `Batch Amount` = c(NA, NA, NA, NA, NA, NA), `Batch Date` = c(NA,
NA, NA, NA, NA, NA), `Batch Number` = c(NA, NA, NA, NA, NA,
NA), `Category Of Service` = c("Maintenance or repair service",
"Maintenance or repair service", "Maintenance or repair service",
"Maintenance or repair service", "Maintenance or repair service",
"Maintenance or repair service"), `Claim Scope` = c("In Scope",
"In Scope", "In Scope", "In Scope", "In Scope", "In Scope"
), `Claim Type` = c("WARRANTY", "WARRANTY", "WARRANTY", "WARRANTY",
"WARRANTY", "WARRANTY"), `Customer Name` = c("SUDHIR POWER LIMITED",
"BHARGAV EARTH MOVERS", "WAGAD INFRA PROJECT PVT LTD", "SUDHIR POWER LIMITED",
"SUDHIR POWER LIMITED", "CORE MULTI SERVICE"), `Final Amount Approved...16` = c(NA,
NA, NA, NA, NA, NA), `Division Name` = c("Pal Engineers - Mohali",
"Sudhir (Ahmedabad) - Rajkot", "Sudhir Sales & Services Limited, Ahmedabad",
"Sudhir Sales & Services Limited, Ahmedabad", "Sudhir Sales & Services Limited, Ahmedabad",
"Sudhir Sales & Services Limited, Ahmedabad"), `Failure Type` = c("Warranty Failure",
"Warranty Failure", "Warranty Failure", "Warranty Failure",
"Warranty Failure", "Warranty Failure"), `GIEA Agreement Name` = c(NA,
NA, NA, NA, NA, NA), `Cummins Invoice Num` = c(NA, NA, NA,
NA, NA, NA), Agreement = c(NA, NA, NA, NA, NA, NA), `Problem Summary` = c("Electrical issue / PCC Controller issues / Starter / alternator issue / Battery",
"Engine not starting / Tripping / Not stopping", "Maintenance / General Check",
"Engine not starting / Tripping / Not stopping", "Engine not starting / Tripping / Not stopping",
"Leakages - Oil/ Fuel/ Coolant / Air"), `Resolution Summary` = c("After recharge battery tested and failed on load test replaced battery warranty",
"REPAIRED THE FUEL PUMP TAKEN TRAIL ALL PARAMETER LIMIT",
"Last service done by 23/12/2019 at 724 hours qt this time change air filter also.today service done at 974 hours.in between customer says top up oil 2.5 ltr then start the engine running ok now all parameters within limits.",
"attend site check & found starter loose connection then correct it & Suggested to customer requests load balances and require proper ventilation for dg set suction and discharge air .",
"ATTEND THE SITE OBSERVE ENGINE FOUND FAULT SHUTDOWN ERROR NEED TO VISIT OEM SIDE",
"ATTEND SITE CHECK & FOUND FUEL LEAKAGE FROM BLEIND PLUG THEN REMOVED IT & FITMENT GAIN & START ENGINE & FOUND ENGINE RUNNING WITHIN LIMIT.."
), `Claim Rejected` = c("Y", "Y", "Y", "Y", "Y", "Y"), `SR Number` = c("SR-PE-MO-2021-006884",
"SR-SU-RJ-1920-002793", "SR-SU-AH-2021-000683", "SR-SU-AH-2021-000857",
"SR-SU-AH-2021-000865", "SR-SU-AH-2021-000913"), `Service Type` = c(NA,
NA, NA, NA, NA, NA), `Sub Type` = c(NA, NA, NA, NA, NA, NA
), `Amount Claimed By Dealer` = c("Rs.5,721.00", "Rs.19,087.00",
"Rs.1,166.00", "Rs.836.00", "Rs.1,034.00", "Rs.2,057.00"),
`Processed By...29` = c("SOLUTIONS.MOHALI#PALENGINEERS.IN",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM"), `Claim #` = c("1-5W4ZZVR",
"1-5PWNEAT", "1-5QQ4Z4J", "1-5QWC86P", "1-5QXPYU1", "1-5QXU7VN"
), `Claim Category` = c("STANDARD", "STANDARD", "STANDARD",
"STANDARD", "STANDARD", "STANDARD"), `Claim Creation Date` = structure(c(1609844392,
1588360803, 1591890038, 1592481430, 1592577627, 1592582659
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), `Created By` = c("1-5LS00O1",
"1-2CD07UT", "1-2CD07UT", "1-2CD07UT", "1-2CD07UT", "1-2CD07UT"
), `Currency Code` = c("INR", "INR", "INR", "INR", "INR",
"INR"), Partner = c(NA, NA, NA, NA, NA, NA), `Final Amount Approved...36` = c("Rs.0.00",
"Rs.0.00", "Rs.0.00", "Rs.0.00", "Rs.0.00", "Rs.0.00"), `Fund Req Category` = c(NA,
NA, NA, NA, NA, NA), Comments = c(NA, NA, NA, NA, NA, NA),
`Claim Name` = c("CLM-PE-MO-2021-002442", "CLM-SU-RJ-2021-000055",
"CLM-SU-AH-2021-000527", "CLM-SU-AH-2021-000627", "CLM-SU-AH-2021-000641",
"CLM-SU-AH-2021-000643"), `Organization Name` = c("Pal Engineers, Jammu",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD",
"Sudhir Sales & Services Limited, Ahmedabad, AHMEDABAD"),
Period = c(NA, NA, NA, NA, NA, NA), `Pre-Approval #` = c(NA,
NA, NA, NA, NA, NA), `Processed By...43` = c("SOLUTIONS.MOHALI#PALENGINEERS.IN",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM",
"CAMC2#SUDHIRGROUP.COM", "CAMC2#SUDHIRGROUP.COM"), `Program Account Name` = c(NA,
NA, NA, NA, NA, NA), `Program Name` = c(NA, NA, NA, NA, NA,
NA), `Promotion Name` = c("BTRY_CHANDIGARH", "CIL_20000",
"CIC_5000", "Warranty_BIS_RECON", "Warranty_BIS_RECON", "CIC_5000"
), Description = c(NA, NA, NA, NA, NA, NA), Status = c("Pending",
"Pending", "Pending", "Pending", "Pending", "Pending"), `Final Approval Date` = c(NA,
NA, NA, NA, NA, NA), `Submitted By` = c("SOLUTIONS.MOHALI#PALENGINEERS.IN",
"WARRANTY.AHD#SUDHIRGROUP.COM", "WARRANTY.AHD#SUDHIRGROUP.COM",
"WARRANTY.AHD#SUDHIRGROUP.COM", "WARRANTY.AHD#SUDHIRGROUP.COM",
"WARRANTY.AHD#SUDHIRGROUP.COM"), `Total Amount Approved` = c(0,
0, 0, 0, 0, 0), `Total Amount Claimed` = c("Rs.5,721.00",
"Rs.19,087.00", "Rs.1,166.00", "Rs.836.00", "Rs.1,034.00",
"Rs.2,057.00"), `Total Participation Amount` = c(NA, NA,
NA, NA, NA, NA), Updated = structure(c(1610113437, 1589227258,
1591896091, 1592491326, 1592645576, 1592839702), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), `Updated By` = c("1-4YO9LU", "1-2QTU4R",
"1-SDR5", "1-SDRU", "1-SDRU", "1-SDR5"), `Resolved By FSL` = c("Y",
"N", "Y", "Y", "N", "N"), `Parts Warranty Claim` = c(NA,
NA, NA, NA, NA, NA), `Inbox Last Updated` = structure(c(1610113437,
1589227258, 1591896091, 1592491326, 1592645576, 1592839702
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), `Claim Submitted Date` = structure(c(1609939043,
NA, NA, NA, NA, NA), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`Claim Rejection Reason` = c("Incorrect/Missing Commercial Bills",
"Incorrect/Missing Technical Documents", "HCS or KAM Approval Required",
"Incorrect/Missing Technical Documents", "Incorrect/Missing Technical Documents",
"Incorrect/Missing Technical Documents"), `Claim Categorization Reason` = c(NA,
NA, NA, NA, NA, NA), Aging = c(2.4278125, 244.16599537037,
213.276724537037, 206.387430555556, 204.60212962963, 202.355300925926
), AgeGroup = structure(c(2L, 6L, 6L, 6L, 6L, 6L), .Label = c("0-1 Days",
"2-4 Days", "5-7 Days", "8-15 Days", "16-30 Days", ">30 Days"
), class = "factor"), Zones = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), Approver.y = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), Claim_Value = c(NA,
NA, NA, 836L, NA, NA)), row.names = c(NA, 6L), class = "data.frame")
The following should work fine.
as.numeric(gsub("Rs.", "", "Rs 2000"))
provided df$`Total Amount Claimed` column is character type and not a factor type.
For showing in lakhs and not in exponential format, use the option
options("scipen"=100, "digits"=4)
You can turn the values to numeric by using gsub in the following way:
df$`Total Amount Claimed`
#[1] "Rs.5,721.00" "Rs.19,087.00" "Rs.1,166.00" "Rs.836.00" "Rs.1,034.00" "Rs.2,057.00"
df$Claim_Value <- as.numeric(gsub('Rs\\.|,', '', df$`Total Amount Claimed`))
df$Claim_Value
#[1] 5721 19087 1166 836 1034 2057

R - object not found error when it exists

I am trying to understand why "R" cannot find a variable that is definitely in my dataframe.
Here is the dput for "DF.1" in my code below:
library("dplyr")
library("stringr")
DF.1 <- structure(list(`ID` = c("APP-5XUEJHC1XN-2019",
"APP-AVO1K5F33B-2019", "APP-J12JZHOWTM-2019", "APP-VROJDQSZ3P-2019",
"APP-00AURK6GEP-2019", "APP-00VACS4YZI-2018", "APP-00W7N0XXSO-2019",
"APP-01AQMLSHX6-2019", "APP-021R8JXC6O-2018", "APP-022XIXHHIQ-2019",
"APP-025ZNBC262-2018", "APP-02IUB6YJ05-2019", "APP-02PSFXZI1U-2019",
"APP-02TZN2M3JT-2019", "APP-034IPEAN7E-2018", "APP-03XWZT90ZW-2018",
"APP-040I2UPEEI-2019", "APP-0442F1YUCB-2019", "APP-04DKWB5EF3-2019",
"APP-04E58XMYDH-2018"), `Observations` = c("Single",
"Single", "Single", "Single", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -20L))
DF.2 <- DF.1 %>% dplyr::mutate(
"New Var" = case_when(
str_detect(tolower(`Observations`), "single") ~ "Single Protocol",
str_detect(tolower(`Observations`), "multiple") |
!(str_detect(tolower(`Observations`), paste(c("single", "multiple"), collapse = '|'))) |
is.na(`Observations`) ~ "Multiple Protocol"))
When I run the above code, I get the following error:
Error in eval_tidy(pair$lhs, env = default_env) :
object 'Observations' not found
The variable is in the dataframe, so I am wondering if there is a conflict with either case_when or str_detect.
you need to assign the structure(...) piece to an object (DF.1 <- ...):
DF.1 <- structure(list(`ID` = c("APP-5XUEJHC1XN-2019",
"APP-AVO1K5F33B-2019", "APP-J12JZHOWTM-2019", "APP-VROJDQSZ3P-2019",
"APP-00AURK6GEP-2019", "APP-00VACS4YZI-2018", "APP-00W7N0XXSO-2019",
"APP-01AQMLSHX6-2019", "APP-021R8JXC6O-2018", "APP-022XIXHHIQ-2019",
"APP-025ZNBC262-2018", "APP-02IUB6YJ05-2019", "APP-02PSFXZI1U-2019",
"APP-02TZN2M3JT-2019", "APP-034IPEAN7E-2018", "APP-03XWZT90ZW-2018",
"APP-040I2UPEEI-2019", "APP-0442F1YUCB-2019", "APP-04DKWB5EF3-2019",
"APP-04E58XMYDH-2018"), `Observations` = c("Single",
"Single", "Single", "Single", NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -20L))

Using gsub (or similar) to extract from a vector and keep last 4 digits of column names

I am trying to use gsub or substr or anything similar to keep the column names which consist of symbols and a date. The symbols.f is stored in a vector (which can take on different ticker symbols);
symbols.f <- c("NVDA.f", "GOOG.f", "GE.f")
I then have the following colnames() from the dput() below.
[1] "GE.f.12.31.2017"
[2] "GE.f.12.31.2016"
[3] "GE.f.12.31.2015"
[4] "GE.f.12.31.2014"
[5] "GOOG.f.12.31.2017"
[6] "GOOG.f.12.31.2016"
[7] "GOOG.f.12.31.2015"
[8] "GOOG.f.12.31.2014"
[9] "NVDA.f.1.28.2018"
[10] "NVDA.f.1.29.2017"
[11] "NVDA.f.1.31.2016"
[12] "NVDA.f.1.25.2015"
What I am trying to do is to keep the ticker and also keep the year, or last 4 digits, of the column names. So for example for the first two tickers;
[1] "GE2017"
[2] "GE2016"
[3] "GE2015"
[4] "GE2014"
[5] "GOOG2017"
[6] "GOOG2016"
[7] "GOOG2015"
[8] "GOOG2014"
I am able to extract the last 4 digits or all characters but cannot seem to do it jointly or in one go.
Data:
df <- structure(list(GE.f.12.31.2017 = c(18211000, NA, 46549000, 21923000,
5790000, 140110000, 38696000, 53874000, 83968000, 20273000, NA,
41024000, 6207000, 377945000, 15153000, 134591000, 21400000,
61893000, 108575000, 82597000, NA, 21122000, NA, 292560000, NA,
NA, NA, 702000, 125682000, -62127000, NA, 22775000, 64257000,
-39984000), GE.f.12.31.2016 = c(10525000, NA, 42687000, 22354000,
2867000, 149029000, 44313000, 50518000, 68070000, 16436000, NA,
34449000, 1833000, 365183000, 14435000, 136211000, 20772000,
70364000, 105080000, 83040000, NA, 4688000, NA, 284667000, NA,
NA, NA, 702000, 139532000, -64412000, NA, 18626000, 75822000,
-11052000), GE.f.12.31.2015 = c(10372000, NA, 43013000, 22515000,
5109000, 280896000, 31973000, 54095000, 65526000, 17797000, NA,
42784000, 3105000, 493071000, 13680000, 197602000, 27453000,
138270000, 144659000, 79175000, NA, 4836000, NA, 389961000, NA,
NA, NA, 702000, 140020000, -42454000, NA, 21085000, 98268000,
14945000), GE.f.12.31.2014 = c(15916000, NA, 23237000, 17639000,
6566000, 460743000, 35505000, 48070000, 53207000, 13182000, NA,
44247000, 6183000, 654954000, 12067000, 261424000, 18203000,
229564000, 186596000, 70801000, NA, 8772000, NA, 518023000, NA,
NA, NA, 702000, 155333000, -27876000, NA, 14717000, 128159000,
61770000), GOOG.f.12.31.2017 = c(10715000, 91156000, 18705000,
749000, 2983000, 124308000, 7813000, 42383000, 16747000, 2692000,
NA, 3352000, 680000, 197295000, 3137000, 3969000, 10651000, 24183000,
3943000, 16641000, NA, NA, NA, 44793000, NA, NA, NA, 40247000,
113247000, -992000, NA, -992000, 152502000, 133063000), GOOG.f.12.31.2016 = c(12918000,
73415000, 15632000, 268000, 3175000, 105408000, 5878000, 34234000,
16468000, 3307000, NA, 2202000, 383000, 167497000, 2041000, 3935000,
5851000, 16756000, 3935000, 7770000, NA, NA, NA, 28461000, NA,
NA, NA, 36307000, 105131000, -2402000, NA, -2402000, 139036000,
119261000), GOOG.f.12.31.2015 = c(15409000, 56517000, 13459000,
491000, 1590000, 90114000, 5183000, 29016000, 15869000, 3847000,
NA, 3432000, 251000, 147461000, 1931000, 7648000, 4327000, 19310000,
1995000, 5825000, NA, NA, NA, 27130000, NA, NA, NA, 32982000,
89223000, -1874000, NA, -1874000, 120331000, 100615000), GOOG.f.12.31.2014 = c(16585000,
46048000, 9974000, NA, 2637000, 78656000, 3079000, 23883000,
15599000, 4607000, NA, 3363000, 176000, 129187000, 1715000, 8015000,
2803000, 16779000, 2992000, 5320000, NA, NA, NA, 25327000, NA,
NA, NA, 28767000, 75066000, 27000, NA, 27000, 103860000, 83654000
), NVDA.f.1.28.2018 = c(7108000, NA, 1265000, 796000, NA, 9255000,
NA, 997000, 618000, 52000, NA, 319000, NA, 11241000, 596000,
2e+06, NA, 1153000, 1985000, 632000, NA, NA, NA, 3770000, NA,
NA, NA, 7471000, NA, NA, NA, NA, 7471000, 6801000), NVDA.f.1.29.2017 = c(1766000,
5032000, 826000, 794000, NA, 8536000, NA, 521000, 618000, 104000,
NA, 62000, NA, 9841000, 485000, 2791000, 325000, 1788000, 1985000,
3e+05, NA, NA, NA, 4079000, NA, NA, NA, 1000, 6108000, -5055000,
4708000, -16000, 5762000, 5040000), NVDA.f.1.31.2016 = c(596000,
4441000, 505000, 418000, NA, 6053000, NA, 466000, 618000, 166000,
NA, 67000, NA, 7370000, 296000, 1434000, 532000, 2351000, 7000,
533000, NA, NA, NA, 2901000, NA, NA, NA, 1000, 4350000, -4052000,
4170000, -4000, 4469000, 3685000), NVDA.f.1.25.2015 = c(497000,
4126000, 474000, 483000, 63000, 5713000, NA, 557000, 618000,
222000, NA, 91000, NA, 7201000, 293000, 1398000, 471000, 896000,
1384000, 489000, NA, NA, NA, 2783000, NA, NA, NA, 1000, 3949000,
-3387000, 3855000, 8000, 4418000, 3578000)), .Names = c("GE.f.12.31.2017",
"GE.f.12.31.2016", "GE.f.12.31.2015", "GE.f.12.31.2014", "GOOG.f.12.31.2017",
"GOOG.f.12.31.2016", "GOOG.f.12.31.2015", "GOOG.f.12.31.2014",
"NVDA.f.1.28.2018", "NVDA.f.1.29.2017", "NVDA.f.1.31.2016", "NVDA.f.1.25.2015"
), row.names = c("Cash And Cash Equivalents", "Short Term Investments",
"Net Receivables", "Inventory", "Other Current Assets", "Total Current Assets",
"Long Term Investments", "Property Plant and Equipment", "Goodwill",
"Intangible Assets", "Accumulated Amortization", "Other Assets",
"Deferred Long Term Asset Charges", "Total Assets", "Accounts Payable",
"Short/Current Long Term Debt", "Other Current Liabilities",
"Total Current Liabilities", "Long Term Debt", "Other Liabilities",
"Deferred Long Term Liability Charges", "Minority Interest",
"Negative Goodwill", "Total Liabilities", "Misc. Stocks Options Warrants",
"Redeemable Preferred Stock", "Preferred Stock", "Common Stock",
"Retained Earnings", "Treasury Stock", "Capital Surplus", "Other Stockholder Equity",
"Total Stockholder Equity", "Net Tangible Assets"), class = "data.frame")
Will this regex work?
gsub("\\..*\\.", "", colnames(df))
It removes the first and last '.' and everything in between.
#[1] "GE2017" "GE2016" "GE2015" "GE2014" "GOOG2017"
#[6] "GOOG2016" "GOOG2015" "GOOG2014" "NVDA2018" "NVDA2017"
#[11] "NVDA2016" "NVDA2015"
# '\\.' = match a dot, '.' = match anything, '*' = match the previous 0 or more times
# so \\..*\\. means "anything 0 or more times, preceded by a dot, followed by a dot"
# the \\ are escapes so the regex can differentiate whether you mean the
# expression '.' (anything) or '\\.' (actual dot)
Here is an alternative to the answer given by @Ape using sub with capture groups:
sub("^([^.]+).*?(\\d+)$", "\\1\\2", colnames(df))
Demo

Resources