One weiRd tip Cut the time it takes to analyse data

Subsetting Dataframes by Column Name with Regular Expressions

Description

Selecting columns of a dataframe with regular expressions.

Code Snippet/Console Buffer Yank

Lets make a test set of data. Column names that follow some sort of system will make this example easier to understand.

> CN.df <- expand.grid(LETTERS, month.abb)
> 
> head(CN.df)
  Var1 Var2
1    A  Jan
2    B  Jan
3    C  Jan
4    D  Jan
5    E  Jan
6    F  Jan
> 
> tail(CN.df)
    Var1 Var2
307    U  Dec
308    V  Dec
309    W  Dec
310    X  Dec
311    Y  Dec
312    Z  Dec
> 
> CN.df$CN <- paste(CN.df$Var1, CN.df$Var2, sep = '_')
> CN.df$CN
  [1] "A_Jan" "B_Jan" "C_Jan" "D_Jan" "E_Jan" "F_Jan" "G_Jan" "H_Jan" "I_Jan"
 [10] "J_Jan" "K_Jan" "L_Jan" "M_Jan" "N_Jan" "O_Jan" "P_Jan" "Q_Jan" "R_Jan"
 [19] "S_Jan" "T_Jan" "U_Jan" "V_Jan" "W_Jan" "X_Jan" "Y_Jan" "Z_Jan" "A_Feb"
 [28] "B_Feb" "C_Feb" "D_Feb" "E_Feb" "F_Feb" "G_Feb" "H_Feb" "I_Feb" "J_Feb"
 [37] "K_Feb" "L_Feb" "M_Feb" "N_Feb" "O_Feb" "P_Feb" "Q_Feb" "R_Feb" "S_Feb"
 [46] "T_Feb" "U_Feb" "V_Feb" "W_Feb" "X_Feb" "Y_Feb" "Z_Feb" "A_Mar" "B_Mar"
 [55] "C_Mar" "D_Mar" "E_Mar" "F_Mar" "G_Mar" "H_Mar" "I_Mar" "J_Mar" "K_Mar"
 [64] "L_Mar" "M_Mar" "N_Mar" "O_Mar" "P_Mar" "Q_Mar" "R_Mar" "S_Mar" "T_Mar"
 [73] "U_Mar" "V_Mar" "W_Mar" "X_Mar" "Y_Mar" "Z_Mar" "A_Apr" "B_Apr" "C_Apr"
 [82] "D_Apr" "E_Apr" "F_Apr" "G_Apr" "H_Apr" "I_Apr" "J_Apr" "K_Apr" "L_Apr"
 [91] "M_Apr" "N_Apr" "O_Apr" "P_Apr" "Q_Apr" "R_Apr" "S_Apr" "T_Apr" "U_Apr"
[100] "V_Apr" "W_Apr" "X_Apr" "Y_Apr" "Z_Apr" "A_May" "B_May" "C_May" "D_May"
[109] "E_May" "F_May" "G_May" "H_May" "I_May" "J_May" "K_May" "L_May" "M_May"
[118] "N_May" "O_May" "P_May" "Q_May" "R_May" "S_May" "T_May" "U_May" "V_May"
[127] "W_May" "X_May" "Y_May" "Z_May" "A_Jun" "B_Jun" "C_Jun" "D_Jun" "E_Jun"
[136] "F_Jun" "G_Jun" "H_Jun" "I_Jun" "J_Jun" "K_Jun" "L_Jun" "M_Jun" "N_Jun"
[145] "O_Jun" "P_Jun" "Q_Jun" "R_Jun" "S_Jun" "T_Jun" "U_Jun" "V_Jun" "W_Jun"
[154] "X_Jun" "Y_Jun" "Z_Jun" "A_Jul" "B_Jul" "C_Jul" "D_Jul" "E_Jul" "F_Jul"
[163] "G_Jul" "H_Jul" "I_Jul" "J_Jul" "K_Jul" "L_Jul" "M_Jul" "N_Jul" "O_Jul"
[172] "P_Jul" "Q_Jul" "R_Jul" "S_Jul" "T_Jul" "U_Jul" "V_Jul" "W_Jul" "X_Jul"
[181] "Y_Jul" "Z_Jul" "A_Aug" "B_Aug" "C_Aug" "D_Aug" "E_Aug" "F_Aug" "G_Aug"
[190] "H_Aug" "I_Aug" "J_Aug" "K_Aug" "L_Aug" "M_Aug" "N_Aug" "O_Aug" "P_Aug"
[199] "Q_Aug" "R_Aug" "S_Aug" "T_Aug" "U_Aug" "V_Aug" "W_Aug" "X_Aug" "Y_Aug"
[208] "Z_Aug" "A_Sep" "B_Sep" "C_Sep" "D_Sep" "E_Sep" "F_Sep" "G_Sep" "H_Sep"
[217] "I_Sep" "J_Sep" "K_Sep" "L_Sep" "M_Sep" "N_Sep" "O_Sep" "P_Sep" "Q_Sep"
[226] "R_Sep" "S_Sep" "T_Sep" "U_Sep" "V_Sep" "W_Sep" "X_Sep" "Y_Sep" "Z_Sep"
[235] "A_Oct" "B_Oct" "C_Oct" "D_Oct" "E_Oct" "F_Oct" "G_Oct" "H_Oct" "I_Oct"
[244] "J_Oct" "K_Oct" "L_Oct" "M_Oct" "N_Oct" "O_Oct" "P_Oct" "Q_Oct" "R_Oct"
[253] "S_Oct" "T_Oct" "U_Oct" "V_Oct" "W_Oct" "X_Oct" "Y_Oct" "Z_Oct" "A_Nov"
[262] "B_Nov" "C_Nov" "D_Nov" "E_Nov" "F_Nov" "G_Nov" "H_Nov" "I_Nov" "J_Nov"
[271] "K_Nov" "L_Nov" "M_Nov" "N_Nov" "O_Nov" "P_Nov" "Q_Nov" "R_Nov" "S_Nov"
[280] "T_Nov" "U_Nov" "V_Nov" "W_Nov" "X_Nov" "Y_Nov" "Z_Nov" "A_Dec" "B_Dec"
[289] "C_Dec" "D_Dec" "E_Dec" "F_Dec" "G_Dec" "H_Dec" "I_Dec" "J_Dec" "K_Dec"
[298] "L_Dec" "M_Dec" "N_Dec" "O_Dec" "P_Dec" "Q_Dec" "R_Dec" "S_Dec" "T_Dec"
[307] "U_Dec" "V_Dec" "W_Dec" "X_Dec" "Y_Dec" "Z_Dec"
> 
> Data <- matrix(data = rnorm(n = 100*nrow(CN.df), mean = 0, sd = 1), ncol = nrow(CN.df))
> 
> dim(Data)
[1] 100 312
> 
> colnames(Data) <- CN.df$CN

We’re now going to select columns using regular expressions.

Let’s practise this just with the column names themselves first:

> grep(pattern = 'Feb', x = colnames(Data))
 [1] 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
[26] 52

and we get the element numbers in the vector colnames(Data) that contain the string Feb

> colnames(Data)[grep(pattern = 'Feb', x = colnames(Data))]
 [1] "A_Feb" "B_Feb" "C_Feb" "D_Feb" "E_Feb" "F_Feb" "G_Feb" "H_Feb" "I_Feb"
[10] "J_Feb" "K_Feb" "L_Feb" "M_Feb" "N_Feb" "O_Feb" "P_Feb" "Q_Feb" "R_Feb"
[19] "S_Feb" "T_Feb" "U_Feb" "V_Feb" "W_Feb" "X_Feb" "Y_Feb" "Z_Feb"

these are also column numbers in the dataframe Data…

> head(Data[, grep(pattern = 'Feb', x = colnames(Data))])
          A_Feb      B_Feb      C_Feb       D_Feb      E_Feb      F_Feb
[1,] -0.6841265 -1.1817348  1.6807043 -0.39257311 -0.9515360  0.8008152
[2,] -0.6779050 -0.9164804  1.9647272  0.07364766 -1.4755660 -1.6456495
[3,] -0.1795740 -3.8315958  0.1533694  0.21030433  1.4679366  0.6412827
[4,] -0.7233379  1.3716940  0.1337730  0.04705661  1.5007768  0.6619927
[5,]  0.5472112  1.7222713 -0.5200483  1.18211624 -0.7153904  0.2553364
[6,] -1.0596905 -1.0081476 -0.6045363  0.62751178  1.5831821 -0.5330454
          G_Feb      H_Feb      I_Feb       J_Feb      K_Feb      L_Feb
[1,]  1.1841829  0.2329420 -2.1188408 -0.25214828 -0.1864550  0.5949313
[2,]  1.9144663 -0.4707447 -0.9954586 -1.66399634  0.5760829  0.1251184
[3,]  0.2713321  0.1858492  0.5804353  0.01503721  0.4025628  0.1949168
[4,] -0.3175766 -1.6206853  0.4731099 -0.88368914 -1.1853159 -0.7878871
[5,]  0.6669143 -0.5031496 -1.8234294 -0.79414369 -0.2683703  0.3607956
[6,] -1.0852894  1.4511176  1.1685282  0.71199378  0.7478227 -0.2974355
          M_Feb      N_Feb       O_Feb      P_Feb      Q_Feb       R_Feb
[1,] -1.3881478  0.2735816 -1.76074556 -1.0148078 -0.6016140  1.32945170
[2,]  0.6753580 -1.1201647 -0.55614611  0.7881342 -0.7993316  1.42795248
[3,]  1.6042324 -0.3887248  0.73089443 -0.4796499 -0.2507031 -1.36970289
[4,]  0.3604392 -1.6558602 -1.18762489  0.4725448 -1.8988363  0.14019003
[5,]  0.2355033  3.3008920  2.10665129  0.6767839  0.4951569  2.13241527
[6,] -1.5474530  0.4436130  0.02131928 -0.7797122  0.8099167  0.06482327
            S_Feb       T_Feb      U_Feb      V_Feb      W_Feb      X_Feb
[1,]  0.880824846 -0.51746541  1.0710203 -1.1793858 -0.5530662 -0.4860293
[2,]  0.002691101 -0.09073245  0.8086777  1.6553748 -0.4882194  2.8290856
[3,] -1.305339790  0.84129879 -0.2418693 -0.5594489  0.5807339 -1.5567147
[4,]  1.639555249  0.01454536 -0.3562965 -0.3608921  0.3614061  0.1387805
[5,] -0.429934749 -0.31134244 -1.4999730 -0.4752920 -0.3603784  1.1385050
[6,]  0.241391062  1.31354226  0.1322101  1.3959864 -0.5923838  0.6377891
          Y_Feb      Z_Feb
[1,]  1.0007115  0.2933871
[2,] -0.4424484 -0.9198374
[3,] -0.3196309 -1.1980152
[4,]  1.0302725  0.4518252
[5,]  0.6680153  0.7511440
[6,]  0.2369676 -0.6112372

Let’s now extract all the column names that begin with F. Note: we can’t just search for all column names that contain the upper case letter F because we’ll get all the columns that contain ‘Feb’ in their names…

> colnames(Data)[grep(pattern = 'F', x = colnames(Data))]
 [1] "F_Jan" "A_Feb" "B_Feb" "C_Feb" "D_Feb" "E_Feb" "F_Feb" "G_Feb" "H_Feb"
[10] "I_Feb" "J_Feb" "K_Feb" "L_Feb" "M_Feb" "N_Feb" "O_Feb" "P_Feb" "Q_Feb"
[19] "R_Feb" "S_Feb" "T_Feb" "U_Feb" "V_Feb" "W_Feb" "X_Feb" "Y_Feb" "Z_Feb"
[28] "F_Mar" "F_Apr" "F_May" "F_Jun" "F_Jul" "F_Aug" "F_Sep" "F_Oct" "F_Nov"
[37] "F_Dec"

we need a simple regular expression

> colnames(Data)[grep(pattern = '^F', x = colnames(Data))]
 [1] "F_Jan" "F_Feb" "F_Mar" "F_Apr" "F_May" "F_Jun" "F_Jul" "F_Aug" "F_Sep"
[10] "F_Oct" "F_Nov" "F_Dec"

the ^ means start of line

> F.Data <- Data[,grep(pattern = '^F', x = colnames(Data))]
> 
> head(F.Data)
          F_Jan      F_Feb       F_Mar      F_Apr        F_May      F_Jun
[1,] -0.2745363  0.8008152 -0.21815602  0.1210484 -0.005076821 0.01058365
[2,] -0.2679376 -1.6456495 -0.69573333  0.7255740  1.231271980 1.32665308
[3,] -0.4451868  0.6412827 -0.50548581 -2.6556828  0.046676109 2.68229178
[4,] -1.2374888  0.6619927 -1.46774735 -0.5745960 -0.923580718 0.97393985
[5,] -1.2218576  0.2553364 -0.02558555 -0.5925002  1.644321061 0.11406815
[6,]  0.8826744 -0.5330454 -0.11360542 -0.6938523  0.758702611 0.76233907
           F_Jul      F_Aug      F_Sep      F_Oct       F_Nov       F_Dec
[1,]  1.55740051 -0.9127755  0.8580327 -0.4472975  0.81835698  1.02912905
[2,]  0.05448689  2.0270125 -0.4244351 -0.1366634  0.03534011 -0.67561827
[3,]  0.21931733 -1.0393318  0.4258849 -1.1134557  0.12469384  0.90877361
[4,] -0.28482266 -1.4338899 -0.4850225 -1.3333187  0.18724145 -0.26304415
[5,]  0.26891692  1.3171009  0.4208927 -0.4915738 -0.92508066  0.03629915
[6,] -0.55537607  1.3738066  1.4363442 -2.3209636  0.90170461 -0.20421596

perhaps we no longer want all the columns in F.Data to have column names the start with F_

> F.Data.CN <- unlist(strsplit(x = colnames(F.Data), split = '_'))
> 
> F.Data.CN[!F.Data.CN == 'F']
 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
> 
> colnames(F.Data) <- F.Data.CN[!F.Data.CN == 'F']
> 
> head(F.Data)
            Jan        Feb         Mar        Apr          May        Jun
[1,] -0.2745363  0.8008152 -0.21815602  0.1210484 -0.005076821 0.01058365
[2,] -0.2679376 -1.6456495 -0.69573333  0.7255740  1.231271980 1.32665308
[3,] -0.4451868  0.6412827 -0.50548581 -2.6556828  0.046676109 2.68229178
[4,] -1.2374888  0.6619927 -1.46774735 -0.5745960 -0.923580718 0.97393985
[5,] -1.2218576  0.2553364 -0.02558555 -0.5925002  1.644321061 0.11406815
[6,]  0.8826744 -0.5330454 -0.11360542 -0.6938523  0.758702611 0.76233907
             Jul        Aug        Sep        Oct         Nov         Dec
[1,]  1.55740051 -0.9127755  0.8580327 -0.4472975  0.81835698  1.02912905
[2,]  0.05448689  2.0270125 -0.4244351 -0.1366634  0.03534011 -0.67561827
[3,]  0.21931733 -1.0393318  0.4258849 -1.1134557  0.12469384  0.90877361
[4,] -0.28482266 -1.4338899 -0.4850225 -1.3333187  0.18724145 -0.26304415
[5,]  0.26891692  1.3171009  0.4208927 -0.4915738 -0.92508066  0.03629915
[6,] -0.55537607  1.3738066  1.4363442 -2.3209636  0.90170461 -0.20421596

How about some slightly more involved regular expressions?

> colnames(Data)[grep(pattern = '(^F|^Z).Feb$', x = colnames(Data))]
[1] "F_Feb" "Z_Feb"
> 
> colnames(Data)[grep(pattern = '(^F|^Z).(Feb$|Ma.$)', x = colnames(Data))]
[1] "F_Feb" "Z_Feb" "F_Mar" "Z_Mar" "F_May" "Z_May"
> 
> head(Data[, grep(pattern = '(^F|^Z).(Feb$|Ma.$)', x = colnames(Data))])
          F_Feb      Z_Feb       F_Mar     Z_Mar        F_May      Z_May
[1,]  0.8008152  0.2933871 -0.21815602 -1.568861 -0.005076821 -0.7032838
[2,] -1.6456495 -0.9198374 -0.69573333  1.918501  1.231271980 -0.1727215
[3,]  0.6412827 -1.1980152 -0.50548581  1.080280  0.046676109  1.5011851
[4,]  0.6619927  0.4518252 -1.46774735 -1.477020 -0.923580718  0.4603256
[5,]  0.2553364  0.7511440 -0.02558555  1.215124  1.644321061 -1.9630911
[6,] -0.5330454 -0.6112372 -0.11360542 -1.192837  0.758702611  0.2811076

That concludes this example.

Read more about regular expressions in R with:

> ?regexp
comments powered by Disqus