Calculating the distance between specific rows contained in an array in R - r

I'm trying to calculate the distance between specific points contained in an array in R. My data looks like this:
curve_array
, , Frame001.txt
[,1] [,2]
[1,] 30.13947 -16.92239
[2,] 30.34071 -16.72115
[3,] 30.53260 -16.52926
[4,] 30.72348 -16.33496
[5,] 30.92572 -16.13614
[6,] 31.13358 -15.95087
[7,] 31.34144 -15.76561
[8,] 31.52396 -15.58309
[9,] 31.73182 -15.39782
[10,] 31.95565 -15.21917
[11,] 32.17287 -15.02455
, , Frame002.txt
[,1] [,2]
[1,] 30.13947 -16.92239
[2,] 30.34071 -16.72115
[3,] 30.53260 -16.52926
[4,] 30.72348 -16.33496
[5,] 30.92572 -16.13614
[6,] 31.13358 -15.95087
[7,] 31.34144 -15.76561
[8,] 31.52396 -15.58309
[9,] 31.73182 -15.39782
[10,] 31.95565 -15.21917
[11,] 32.17287 -15.02455
, , Frame003.txt
[,1] [,2]
[1,] 30.13947 -16.92239
[2,] 30.34071 -16.72115
[3,] 30.53260 -16.52926
[4,] 30.72348 -16.33496
[5,] 30.92572 -16.13614
[6,] 31.13358 -15.95087
[7,] 31.34144 -15.76561
[8,] 31.52396 -15.58309
[9,] 31.73182 -15.39782
[10,] 31.95565 -15.21917
[11,] 32.17287 -15.02455
And for each slice I am trying to calculate the distance between the points contained in row [1,] and row [11,] (the first and last points).
I have truly gotten nowhere with this (I've tried the dist function and the geomorph::interlmkdist function) so any help would be much appreciated. I most recently tried the usedist::dist_subset function but it showed the following error, 'Error in as.matrix(d)[idx, idx] : no 'dimnames' attribute for array'.
I've had success in using the distancePointToPoint function but have to manually input the values from rows [1,] and [11,] which given the extent of the array, is not ideal.
Ideally, I want to return an array that looks something like this:
,, Frame001.txt
[1] 2.781459
,, Frame002.txt
[1] 2.781459
,, Frame003.txt
[1] 2.781459
etc.
Thank you!
dput(curve_array)
structure(c(30.1394716184822, 30.3407126170086, 30.5325951613319,
30.7234753517486, 30.9257187041817, 31.1335771291367, 31.3414355540918,
31.5239596442118, 31.7318180691669, 31.9556523747537, 32.172869253912,
-16.9223869881883, -16.7211459896618, ...), dim = c(11L,
2L, 47L), dimnames = list(NULL, NULL, c("Frame001.txt", "Frame002.txt",
"Frame003.txt", "Frame004.txt", "Frame005.txt", "Frame006.txt",
"Frame007.txt", "Frame008.txt", "Frame009.txt", "Frame010.txt",
"Frame011.txt", "Frame012.txt", "Frame013.txt", "Frame014.txt",
"Frame015.txt", "Frame016.txt", "Frame017.txt", "Frame018.txt",
"Frame019.txt", "Frame020.txt", "Frame021.txt", "Frame022.txt",
"Frame023.txt", "Frame024.txt", "Frame025.txt", "Frame026.txt",
"Frame027.txt", "Frame028.txt", "Frame029.txt", "Frame030.txt",
"Frame031.txt", "Frame032.txt", "Frame033.txt", "Frame034.txt",
"Frame035.txt", "Frame036.txt", "Frame037.txt", "Frame038.txt",
"Frame039.txt", "Frame040.txt", "Frame041.txt", "Frame042.txt",
"Frame043.txt", "Frame044.txt", "Frame045.txt", "Frame046.txt",
"Frame047.txt")))

You can use apply along the third margin to apply the distance operation to each slice of your array. This is just a simple Euclidean distance function between the first and 11th rows. The output is a named vector:
apply(curve_array, 3, function(x) sqrt((x[1, 1]-x[11, 1])^2 + (x[1, 2]-x[11, 2])^2))
#> Frame001.txt Frame002.txt Frame003.txt Frame004.txt Frame005.txt
#> 2.781455 2.781455 2.781455 2.781455 2.668020
#> Frame006.txt Frame007.txt Frame008.txt Frame009.txt Frame010.txt
#> 2.548641 2.350681 2.121847 1.791864 1.446678
#> Frame011.txt Frame012.txt Frame013.txt Frame014.txt Frame015.txt
#> 1.192961 1.054892 1.074074 1.182647 1.403697
#> Frame016.txt Frame017.txt Frame018.txt Frame019.txt Frame020.txt
#> 1.644818 1.889481 2.036533 2.155975 2.240272
#> Frame021.txt Frame022.txt Frame023.txt Frame024.txt Frame025.txt
#> 2.380460 2.484956 2.530551 2.537903 2.557738
#> Frame026.txt Frame027.txt Frame028.txt Frame029.txt Frame030.txt
#> 2.580983 2.659684 2.638101 2.697796 2.698458
#> Frame031.txt Frame032.txt Frame033.txt Frame034.txt Frame035.txt
#> 2.700066 2.693820 2.727060 2.718718 2.631126
#> Frame036.txt Frame037.txt Frame038.txt Frame039.txt Frame040.txt
#> 2.702522 2.744878 2.787322 2.758420 2.801023
#> Frame041.txt Frame042.txt Frame043.txt Frame044.txt Frame045.txt
#> 2.772264 2.772264 2.793633 2.750915 2.686989
#> Frame046.txt Frame047.txt
#> 2.715900 2.665723

With some subsetting apply() works well with dist() too:
apply(curve_array[c(1,11),,], 3, dist)
#> Frame001.txt Frame002.txt Frame003.txt Frame004.txt Frame005.txt Frame006.txt
#> 2.781455 2.781455 2.781455 2.781455 2.668020 2.548641
#> Frame007.txt Frame008.txt Frame009.txt Frame010.txt Frame011.txt Frame012.txt
#> 2.350681 2.121847 1.791864 1.446678 1.192961 1.054892
#> Frame013.txt Frame014.txt Frame015.txt Frame016.txt Frame017.txt Frame018.txt
#> 1.074074 1.182647 1.403697 1.644818 1.889481 2.036533
#> Frame019.txt Frame020.txt Frame021.txt Frame022.txt Frame023.txt Frame024.txt
#> 2.155975 2.240272 2.380460 2.484956 2.530551 2.537903
#> Frame025.txt Frame026.txt Frame027.txt Frame028.txt Frame029.txt Frame030.txt
#> 2.557738 2.580983 2.659684 2.638101 2.697796 2.698458
#> Frame031.txt Frame032.txt Frame033.txt Frame034.txt Frame035.txt Frame036.txt
#> 2.700066 2.693820 2.727060 2.718718 2.631126 2.702522
#> Frame037.txt Frame038.txt Frame039.txt Frame040.txt Frame041.txt Frame042.txt
#> 2.744878 2.787322 2.758420 2.801023 2.772264 2.772264
#> Frame043.txt Frame044.txt Frame045.txt Frame046.txt Frame047.txt
#> 2.793633 2.750915 2.686989 2.715900 2.665723

What do you mean by "distance"? Given your values I am guessing that these are geographic coordinates, and that you want geographic distance on a spheroid, not Euclidean distance.
If so, you could get the distance between the first and 11th row for one matrix like this
library(geosphere)
distGeo(curve_array[1,,1], curve_array[11,,1])
[1] 302457.3
Assuming that the data are in longitude(x)/latitude(y) order, and not the other way around!
And for the whole lot like this
a <- apply(curve_array, 3, \(m) distGeo(m[1,], m[11,]))
head(a)
#Frame001.txt Frame002.txt Frame003.txt Frame004.txt Frame005.txt Frame006.txt
# 302457.3 302457.3 302457.3 302457.3 290466.7 278119.9
For Euclidean distance you could use
b <- apply(curve_array, 3, \(m) dist(m[c(1,11), ]))
head(b)
#Frame001.txt Frame002.txt Frame003.txt Frame004.txt Frame005.txt Frame006.txt
# 2.781455 2.781455 2.781455 2.781455 2.668020 2.548641
Which is the same result as in Allan Cameron's answer

Related

How to obtain some points located around a points with GPS coordinates in R?

I am quite new to cartography operations with R. I would like to know if there is a way to find some points located around a given GPS point.
Imagine that I have a data.table as follows:
Reference_Coordinates <- data.table(Latitude=c(1,2,3), Longitude=c(4,5,6), Point_ID=c("Point1","Point2","Point3")).
If I assume that each point is the centre of a circle, how could I get some points located at equal distance from the origin? For instance, 10 points forming a 1-km radius circle around point 1, then another 10 points around point 2, et cetera? I wanted to use a WGS 84 standard ellipsoid.
Thank you very much.
I've read some answers in this forum and I suppose I could use some packages such as sf or sp, but I couldn't find the solution to my question.
Using the R package terra, you can create 3 points from the lat / lon co-ordinates like this:
library(terra)
v <-vect(cbind(Reference_Coordinates$Longitude, Reference_Coordinates$Latitude),
type = 'points', crs = 'WGS84')
To get a collection of points 10km from each starting point, you can do:
p <- as.points(buffer(v, 10000, quadsegs = 3))
Which looks like this:
plot(p, xlab = 'longitude', ylab = 'latitude')
To get the co-ordinates of each of these points, you can simply do
crds(p)
#> x y
#> [1,] 4.000000 1.0904366
#> [2,] 4.044924 1.0783201
#> [3,] 4.077809 1.0452174
#> [4,] 4.089845 0.9999988
#> [5,] 4.077807 0.9547807
#> [6,] 4.044922 0.9216792
#> [7,] 4.000000 0.9095633
#> [8,] 3.955078 0.9216792
#> [9,] 3.922193 0.9547807
#> [10,] 3.910155 0.9999988
#> [11,] 3.922191 1.0452174
#> [12,] 3.955076 1.0783201
#> [13,] 5.000000 2.0904358
#> [14,] 5.044945 2.0783191
#> [15,] 5.077846 2.0452160
#> [16,] 5.089886 1.9999975
#> [17,] 5.077841 1.9547802
#> [18,] 5.044941 1.9216796
#> [19,] 5.000000 1.9095641
#> [20,] 4.955059 1.9216796
#> [21,] 4.922159 1.9547802
#> [22,] 4.910114 1.9999975
#> [23,] 4.922154 2.0452160
#> [24,] 4.955055 2.0783191
#> [25,] 6.000000 3.0904344
#> [26,] 6.044980 3.0783175
#> [27,] 6.077906 3.0452144
#> [28,] 6.089954 2.9999963
#> [29,] 6.077899 2.9547800
#> [30,] 6.044974 2.9216805
#> [31,] 6.000000 2.9095655
#> [32,] 5.955026 2.9216805
#> [33,] 5.922101 2.9547800
#> [34,] 5.910046 2.9999963
#> [35,] 5.922094 3.0452144
#> [36,] 5.955020 3.0783175
Created on 2023-01-31 with reprex v2.0.2

How to fit an ellipse to a series of points in R?

I am trying to fit an ellipse to a series of landmark points in R. The data is contained within an array and located at row 6. Here is a sample of the data:
, , Frame001.txt
[,1] [,2]
[1,] 30.13947 -16.92239
[2,] 30.34071 -16.72115
[3,] 30.53260 -16.52926
[4,] 30.72348 -16.33496
[5,] 30.92572 -16.13614
[6,] 31.13358 -15.95087
[7,] 31.34144 -15.76561
[8,] 31.52396 -15.58309
[9,] 31.73182 -15.39782
[10,] 31.95565 -15.21917
[11,] 32.17287 -15.02455
, , Frame002.txt
[,1] [,2]
[1,] 30.13947 -16.92239
[2,] 30.34071 -16.72115
[3,] 30.53260 -16.52926
[4,] 30.72348 -16.33496
[5,] 30.92572 -16.13614
[6,] 31.13358 -15.95087
[7,] 31.34144 -15.76561
[8,] 31.52396 -15.58309
[9,] 31.73182 -15.39782
[10,] 31.95565 -15.21917
[11,] 32.17287 -15.02455
, , Frame003.txt
[,1] [,2]
[1,] 30.13947 -16.92239
[2,] 30.34071 -16.72115
[3,] 30.53260 -16.52926
[4,] 30.72348 -16.33496
[5,] 30.92572 -16.13614
[6,] 31.13358 -15.95087
[7,] 31.34144 -15.76561
[8,] 31.52396 -15.58309
[9,] 31.73182 -15.39782
[10,] 31.95565 -15.21917
[11,] 32.17287 -15.02455
dput(array)
structure(c(30.1394716184822, 30.3407126170086, 30.5325951613319,
30.7234753517486, 30.9257187041817, 31.1335771291367, 31.3414355540918,
31.5239596442118, 31.7318180691669, 31.9556523747537, 32.172869253912,
-16.9223869881883, -16.7211459896618, -16.5292634453385, -16.3349610046196,
-16.1361399024888, -15.950874784594, -15.7656096666993, -15.5830855765793,
-15.3978204586845, -15.2191727672184, -15.0245491951204, 30.1394716184822,
30.3407126170086, 30.5325951613319, 30.7234753517486, 30.9257187041817,
31.1335771291367, 31.3414355540918, 31.5239596442118, 31.7318180691669,
31.9556523747537, 32.172869253912, -16.9223869881883, -16.7211459896618,
-16.5292634453385, -16.3349610046196, -16.1361399024888, -15.950874784594,
-15.7656096666993, -15.5830855765793, -15.3978204586845, -15.2191727672184,
-15.0245491951204, 30.1394716184822, 30.3407126170086, 30.5325951613319,
30.7234753517486, 30.9257187041817, 31.1335771291367, 31.3414355540918,
31.5239596442118, 31.7318180691669, 31.9556523747537, 32.172869253912,
-16.9223869881883, -16.7211459896618, -16.5292634453385, -16.3349610046196,
-16.1361399024888, -15.950874784594, -15.7656096666993, -15.5830855765793,
-15.3978204586845, -15.2191727672184, -15.0245491951204, 30.1394716184822,
30.3407126170086, 30.5325951613319, 30.7234753517486, 30.9257187041817,
31.1335771291367, 31.3414355540918, 31.5239596442118, 31.7318180691669,
31.9556523747537, 32.172869253912, -16.9223869881883, -16.7211459896618,
-16.5292634453385, -16.3349610046196, -16.1361399024888, -15.950874784594,
-15.7656096666993, -15.5830855765793, -15.3978204586845, -15.2191727672184,
-15.0245491951204, 30.2524381537838, 30.3531916700196, 30.5272071629858,
30.7171985365838, 30.8950904282038, 31.0823407740271, 31.2949254546854,
31.4994281378246, 31.7210031127054, 31.9271974411109, 32.1276826397913,
-16.9901669093693, -16.7538535507715, -16.5346514436846, -16.3220667630263,
-16.1441748714063, -15.956924525583, -15.766933151985, -15.5893818716287,
-15.4270311921735, -15.2476277008611, -15.0923291163014), dim = c(11L,
2L, 5L), dimnames = list(NULL, NULL, c("Frame001.txt", "Frame002.txt",
"Frame003.txt", "Frame004.txt", "Frame005.txt")))
I have tried using the ellipse function in R without much success. A similar question on stack exchange used conf_ell but my R isn't liking the Momocs package/can't find the conf_ell function. So the farthest I've got at this point is to create a vector with the desired points...
ellipse.points <- array[6,,]
I want the result to be an ellipse of best fit, using the points contained within the array. Any help would be much appreciated, thank you!

Local Polynomial Regression: Reading nearest neighbor smoothing constants from R code

I'm studying from a textbook on data mining and I can't figure out how the author reads the nn values from the gcv output. The code and output are below:
## cv
alpha <- seq(0.20, 1, by = 0.01)
n1 = length(alpha)
g = matrix(nrow = n1, ncol = 4)
for (k in 1:length(alpha)) {
g[k,] <- gcv(NOx ~ lp(EquivRatio, nn = alpha[k]), data = ethanol)
}
g
the csv file is here:
https://github.com/jgscott/ECO395M/blob/master/data/ethanol.csv
I'm using the locfit library in R.
How do you find the nn values from the given output?
The nn values are not read from the output - they are given in the input. In the loop, nn is assigned as the kth value of the object alpha.
Let's look at the output of the first 16 rows of g, which is the same as the picture you included in your question:
g[1:16,]
#> [,1] [,2] [,3] [,4]
#> [1,] -3.220084 18.81266 16.426487 0.1183932
#> [2,] -3.249601 17.61614 15.436227 0.1154507
#> [3,] -3.319650 16.77004 14.752039 0.1151542
#> [4,] -3.336464 15.44404 13.889209 0.1115457
#> [5,] -3.373011 14.52391 13.115430 0.1099609
#> [6,] -3.408908 13.96789 12.634934 0.1094681
#> [7,] -3.408908 13.96789 12.634934 0.1094681
#> [8,] -3.469254 12.99316 11.830996 0.1085293
#> [9,] -3.504310 12.38808 11.283837 0.1078784
#> [10,] -3.529167 11.93838 10.928859 0.1073628
#> [11,] -3.546728 11.46960 10.516520 0.1065792
#> [12,] -3.552238 11.26372 10.322329 0.1061728
#> [13,] -3.576083 11.03575 10.135243 0.1062533
#> [14,] -3.679128 10.54096 9.662613 0.1079229
#> [15,] -3.679128 10.54096 9.662613 0.1079229
#> [16,] -3.699044 10.46534 9.578396 0.1082955
Note that rows 11, 12 and 13 were created inside your loop using alpha[11], alpha[12] and alpha[13]. These values were passed to the nn argument of lp. If you want the nn values included in your table, all you need to do is:
cbind(g, nn = alpha)
#> nn
#> [1,] -3.220084 18.812657 16.426487 0.1183932 0.20
#> [2,] -3.249601 17.616143 15.436227 0.1154507 0.21
#> [3,] -3.319650 16.770041 14.752039 0.1151542 0.22
#> [4,] -3.336464 15.444040 13.889209 0.1115457 0.23
#> [5,] -3.373011 14.523910 13.115430 0.1099609 0.24
#> [6,] -3.408908 13.967891 12.634934 0.1094681 0.25
#> [7,] -3.408908 13.967891 12.634934 0.1094681 0.26
#> [8,] -3.469254 12.993165 11.830996 0.1085293 0.27
#> [9,] -3.504310 12.388077 11.283837 0.1078784 0.28
#> [10,] -3.529167 11.938379 10.928859 0.1073628 0.29
#> [11,] -3.546728 11.469598 10.516520 0.1065792 0.30
#> [12,] -3.552238 11.263716 10.322329 0.1061728 0.31
#> [13,] -3.576083 11.035752 10.135243 0.1062533 0.32
#> [14,] -3.679128 10.540964 9.662613 0.1079229 0.33
#> [15,] -3.679128 10.540964 9.662613 0.1079229 0.34
#> [16,] -3.699044 10.465337 9.578396 0.1082955 0.35

r igraph - how does plot() read the layout matrix?

My question is related to this one here, which unfortunately has not been answered. I'm trying to automatically annotate text next to highlighted communities on a plot. An intermediate step is to understand how nodes are placed on a plot.
G <- make_graph('zachary')
l <- layout_with_fr(G)
l
A layout is a matrix with rows representing nodes and columns representing the x and y plot parameters.
[,1] [,2]
[1,] 2.8510654 -2.2404898
[2,] 2.7183497 -1.1815130
[3,] 3.1429205 0.1117099
[4,] 1.5585372 -1.0743325
[5,] 2.2808632 -4.2035479
[6,] 2.1698198 -5.0526766
[7,] 1.4938068 -4.6975884
[8,] 1.9710816 -1.4672218
[9,] 3.5407035 0.5407852
[10,] 2.2222909 1.9079805
[11,] 3.0784642 -4.5828448
[12,] 4.4115351 -4.1057462
[13,] 0.6002378 -2.2432049
[14,] 2.5010525 -0.1563341
[15,] 4.8914673 4.1417759
[16,] 3.2053338 3.9212694
[17,] 1.1825200 -6.4099021
[18,] 3.7155897 -2.8354432
[19,] 3.8272351 4.2660906
[20,] 3.8636487 -0.5671906
[21,] 2.7302411 3.3998888
[22,] 1.6084374 -2.7407388
[23,] 4.3432855 3.8101278
[24,] 5.9392042 2.2364929
[25,] 6.9980077 0.2389222
[26,] 7.1608499 1.1360134
[27,] 6.0171481 4.0279067
[28,] 5.4996627 1.0367163
[29,] 4.4961257 0.9434659
[30,] 5.5987563 3.2314488
[31,] 2.9958404 1.2022317
[32,] 5.1188900 0.2919268
[33,] 4.1088296 2.5032294
[34,] 4.1686534 2.1339884
But the x, y coordinates of the plot go from -1 to 1, unlike the min-max coordinates in the layout matrix. So how is plot(G, layout = l) reading the layout matrix?
According to the source, the plot method for objects of class igraph simply rescales the matrix to the range -1 to 1.
library(igraph)
set.seed(3)
l <- layout_with_fr(G)
[,1] [,2]
[1,] -2.283 0.658
[2,] -1.289 -0.108
[3,] 0.146 1.012
[4,] -1.523 1.601
#... with 30 more rows.
plot(G,layout = l)
maxs <- apply(l, 2, max)
mins <- apply(l, 2, min)
ll <- scale(l, center=(maxs+mins)/2, scale=(maxs-mins)/2)
ll
[,1] [,2]
[1,] -0.2422 -0.1051
[2,] -0.0704 -0.3821
[3,] 0.1775 0.0228
[4,] -0.1108 0.2357
#... with 30 more rows.
plot(G,layout = ll)
Note that the actual rescaling is performed with igraph::norm_coords:
igraph::norm_coords(l)
[,1] [,2]
[1,] -0.2422 -0.1051
[2,] -0.0704 -0.3821
[3,] 0.1775 0.0228
[4,] -0.1108 0.2357
#... with 30 more rows.

How to split these ingredients that are followed by a different numbers of abbreviations within brackets

Here is an excerpt of what my data looks like:
"glyphosate (HBC), atrazine (HBC), metam potassium (FUN, HBC, INS, NEM), dichloropropene (HBC, NEM), metolachlor(-s) (HBC), chlorothalonil (FUN), chloropicrin (NEM), bacillus amyloliquifacien (FUN), 2,4-d (HBC, PGR)"
I want this example to look like this:
I know all the possible abbreviations (HBC, FUN etc) if that helps.
This is what I have tried so far.
str = "glyphosate (HBC), atrazine (HBC), dichloropropene (HBC, NEM), metolachlor(-s) (HBC), chlorothalonil (FUN), chloropicrin (NEM), bacillus amyloliquifacien (FUN), 2,4-d (HBC, PGR), pendimethalin (HBC), metam (FUN, HBC, INS, NEM), acetochlor (HBC), metribuzin (HBC), dicamba (HBC), phorate (INS), chlorpyrifos (ACA, INS), flutolanil (FUN), paraquat (HBC), propazine (HBC), dimethenamid(-p) (HBC, FUN), bromoxynil (HBC)"
vec = unlist(unlist(strsplit(str, " ()")))
vec_clean = gsub('^\\(|\\),|\\,|)$', '', vec)
matrix(vec_clean,nrow = 24,ncol = 2, byrow = TRUE)
Which gives me:
> matrix(vec_clean,nrow = 24,ncol = 2, byrow = TRUE)
[,1] [,2]
[1,] "glyphosate" "HBC"
[2,] "atrazine" "HBC"
[3,] "dichloropropene" "HBC"
[4,] "NEM" "metolachlor(-s"
[5,] "HBC" "chlorothalonil"
[6,] "FUN" "chloropicrin"
[7,] "NEM" "bacillus"
[8,] "amyloliquifacien" "FUN"
[9,] "24-d" "HBC"
[10,] "PGR" "pendimethalin"
[11,] "HBC" "metam"
[12,] "FUN" "HBC"
[13,] "INS" "NEM"
[14,] "acetochlor" "HBC"
[15,] "metribuzin" "HBC"
[16,] "dicamba" "HBC"
[17,] "phorate" "INS"
[18,] "chlorpyrifos" "ACA"
[19,] "INS" "flutolanil"
[20,] "FUN" "paraquat"
[21,] "HBC" "propazine"
[22,] "HBC" "dimethenamid(-p"
[23,] "HBC" "FUN"
[24,] "bromoxynil" "HBC"
The argument I use in str split " ()" was due to trial and error and I don't understand exactly why this works to some extent. I tried removing the leading and trailing "(" and ",)" using an example that I modified from here:
conditionally remove leading or trailing `.` character in R
So it is a start, but when an ingredient has more abbreviations e.g. (FUN, HBC etc) it also splits it there. It also removes the "," from 2,4-d and from between abbreviations, which it shouldn't. It removes the trailing ")" from dimethenamid(-p), which it also shouldn't.
Using stringr::str_match_all
stringr::str_match_all(str, ",?\\s?(.*?)\\s\\((.*?)\\),")[[1]][, -1]
# [,1] [,2]
# [1,] "glyphosate" "HBC"
# [2,] "atrazine" "HBC"
# [3,] "dichloropropene" "HBC, NEM"
# [4,] "metolachlor(-s)" "HBC"
# [5,] "chlorothalonil" "FUN"
# [6,] "chloropicrin" "NEM"
# [7,] "bacillus amyloliquifacien" "FUN"
# [8,] "2,4-d" "HBC, PGR"
# [9,] "pendimethalin" "HBC"
#[10,] "metam" "FUN, HBC, INS, NEM"
#[11,] "acetochlor" "HBC"
#[12,] "metribuzin" "HBC"
#[13,] "dicamba" "HBC"
#[14,] "phorate" "INS"
#[15,] "chlorpyrifos" "ACA, INS"
#[16,] "flutolanil" "FUN"
#[17,] "paraquat" "HBC"
#[18,] "propazine" "HBC"
#[19,] "dimethenamid(-p)" "HBC, FUN"
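We extract two parts from each value. The first part starts after an optional comma and whitespace (for the 2nd value onwards) and runs until a space followed by an opening round bracket is encountered; the second part is everything inside that opening and closing round bracket pair. Note that the pattern requires a comma after the closing bracket, so the final entry (bromoxynil), which has no trailing comma, is not matched, which is why only 19 of the 20 ingredients appear above. Ending the pattern with "\\)(?:,|$)" instead of "\\)," would include it.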
Here is a base R solution:
str = "glyphosate (HBC), atrazine (HBC), dichloropropene (HBC, NEM), metolachlor(-s) (HBC), chlorothalonil (FUN), chloropicrin (NEM), bacillus amyloliquifacien (FUN), 2,4-d (HBC, PGR), pendimethalin (HBC), metam (FUN, HBC, INS, NEM), acetochlor (HBC), metribuzin (HBC), dicamba (HBC), phorate (INS), chlorpyrifos (ACA, INS), flutolanil (FUN), paraquat (HBC), propazine (HBC), dimethenamid(-p) (HBC, FUN), bromoxynil (HBC)"
out <- strsplit(str, "(?<=\\)), ", perl=TRUE)[[1]]
out <- strsplit(out, "( \\()|(\\)$)", perl=TRUE)
do.call(rbind, out)
#> [,1] [,2]
#> [1,] "glyphosate" "HBC"
#> [2,] "atrazine" "HBC"
#> [3,] "dichloropropene" "HBC, NEM"
#> [4,] "metolachlor(-s)" "HBC"
#> [5,] "chlorothalonil" "FUN"
#> [6,] "chloropicrin" "NEM"
#> [7,] "bacillus amyloliquifacien" "FUN"
#> [8,] "2,4-d" "HBC, PGR"
#> [9,] "pendimethalin" "HBC"
#> [10,] "metam" "FUN, HBC, INS, NEM"
#> [11,] "acetochlor" "HBC"
#> [12,] "metribuzin" "HBC"
#> [13,] "dicamba" "HBC"
#> [14,] "phorate" "INS"
#> [15,] "chlorpyrifos" "ACA, INS"
#> [16,] "flutolanil" "FUN"
#> [17,] "paraquat" "HBC"
#> [18,] "propazine" "HBC"
#> [19,] "dimethenamid(-p)" "HBC, FUN"
#> [20,] "bromoxynil" "HBC"
Created on 2020-04-05 by the reprex package (v0.3.0)

Resources