Movatterモバイル変換


[0]ホーム

URL:


urlparse

CRAN statusR-CMD-checkCodecov test coverageurlparse status badge

Fast and simple url parser for R. Initially developed for the paws.common package.

urlparse::url_parse("https://user:pass@host.com:8000/path?query=1#fragment")#> $scheme#> [1] "https"#>#> $user#> [1] "user"#>#> $password#> [1] "pass"#>#> $host#> [1] "host.com"#>#> $port#> [1] "8000"#>#> $path#> [1] "/path"#>#> $raw_path#> [1] ""#>#> $query#> $query$query#> [1] "1"#>#>#> $raw_query#> [1] "query=1"#>#> $fragment#> [1] "fragment"

Installation

You can install the development version of urlparse like so:

remotes::install_github("dyfanjones/urlparse")

r-universe installation:

install.packages("urlparse",repos =c("https://dyfanjones.r-universe.dev","https://cloud.r-project.org"))

Example

This is a basic example which shows you how to solve a common problem:

library(urlparse)
url_encoder("foo = bar + 5")#> [1] "foo%20%3D%20bar%20%2B%205"url_decoder(url_encoder("foo = bar + 5"))#> [1] "foo = bar + 5"

Similar to Python’s `from urllib.parse import quote`, `urlparse::url_encoder` supports the `safe` parameter: the additional ASCII characters that should not be encoded.

from urllib.parse import quote
quote("foo = bar + 5", safe="+")
#> 'foo%20%3D%20bar%20+%205'
url_encoder("foo = bar + 5",safe ="+")#> [1] "foo%20%3D%20bar%20+%205"

Modify a url through piping using the `set_*` functions, or by using the standalone `url_modify` function.

url<-"http://example.com"set_scheme(url,"https")|>set_port(1234L)|>set_path("foo/bar")|>set_query("baz")|>set_fragment("quux")#> [1] "https://example.com:1234/foo/bar?baz#quux"url_modify(url,scheme ="https",port =1234,path ="foo/bar",query ="baz",fragment ="quux")#> [1] "https://example.com:1234/foo/bar?baz#quux"

Note: it is faster to use `url_modify` rather than piping the `set_*` functions. This is because `urlparse` has to parse the url within each `set_*` call in order to modify the url.

url<-"http://example.com"bench::mark(piping = {set_scheme(url,"https")|>set_port(1234L)|>set_path("foo/bar")|>set_query("baz")|>set_fragment("quux")},single_function =url_modify(url,scheme ="https",port =1234,path ="foo/bar",query ="baz",fragment ="quux"))#> # A tibble: 2 × 6#>   expression           min   median `itr/sec` mem_alloc `gc/sec`#>   <bch:expr>      <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>#> 1 piping            5.29µs    5.9µs   162854.        0B     16.3#> 2 single_function    1.6µs    1.8µs   517151.        0B     51.7

Benchmark:

Parsing URL:

url<-"https://user:pass@host.com:8000/path?query=1#fragment"(bm<- bench::mark(urlparse = urlparse::url_parse(url),httr2 = httr2::url_parse(url),curl = curl::curl_parse_url(url),urltools = urltools::url_parse(url),check = F))#> # A tibble: 4 × 6#>   expression      min   median `itr/sec` mem_alloc `gc/sec`#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>#> 1 urlparse     1.72µs   1.93µs   488712.        0B      0#> 2 httr2       22.39µs  23.86µs    40406.  571.07KB     24.3#> 3 curl        27.06µs  29.64µs    29324.        0B     14.7#> 4 urltools   124.44µs 132.68µs     7119.    2.18MB     23.2show_relative(bm)#> # A tibble: 4 × 6#>   expression   min median `itr/sec` mem_alloc `gc/sec`#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>#> 1 urlparse     1      1       68.6        NaN      NaN#> 2 httr2       13.0   12.4      5.68       Inf      Inf#> 3 curl        15.7   15.4      4.12       NaN      Inf#> 4 urltools    72.3   68.9      1          Inf      Infggplot2::autoplot(bm)

Since urlparse v0.1.999+ you can use the vectorised url parser `url_parse_v2`:

urls<-c("https://www.example.com","https://www.google.com/maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519","https://user_1:password_1@example.org:8080/dir/../api?q=1#frag","https://user:password@example.com","https://www.example.com:8080/search%3D1%2B3","https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4","https://www.example.com:8080?var1=foo&var2=ba%20r&var3=baz+larry","https://user:password@example.com:8080","https://user:password@example.com","https://user@example.com:8080","https://user@example.com")(bm<- bench::mark(urlparse =lapply(urls, urlparse::url_parse),urlparse_v2 = urlparse::url_parse_v2(urls),httr2 =lapply(urls, httr2::url_parse),curl =lapply(urls, curl::curl_parse_url),urltools = urltools::url_parse(urls),check = F))#> # A tibble: 5 × 6#>   expression       min   median `itr/sec` mem_alloc `gc/sec`#>   <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>#> 1 urlparse      19.4µs   21.3µs    46214.      200B    13.9#> 2 urlparse_v2   10.5µs     11µs    87963.     4.3KB     0#> 3 httr2        171.6µs  181.6µs     5232.        0B    10.2#> 4 curl         188.7µs  198.4µs     4895.        0B     8.14#> 5 urltools       130µs  142.1µs     6569.        0B    10.2show_relative(bm)#> # A tibble: 5 × 6#>   expression    min median `itr/sec` mem_alloc `gc/sec`#>   <bch:expr>  <dbl>  <dbl>     <dbl>     <dbl>    <dbl>#> 1 urlparse     1.85   1.94      9.44       Inf      Inf#> 2 urlparse_v2  1      1        18.0        Inf      NaN#> 3 httr2       16.4   16.5       1.07       NaN      Inf#> 4 curl        18.0   18.1       1          NaN      Inf#> 5 urltools    12.4   12.9       1.34       NaN      Infggplot2::autoplot(bm)

Note: `url_parse_v2` returns the parsed url as a `data.frame`; this is similar behaviour to `urltools` and `adaR`:

urlparse::url_parse_v2(urls)#>                                                                                                                                                                       href#> 1                                                                                                                                                  https://www.example.com#> 2  https://www.google.com/maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519#> 3                                                                                                           https://user_1:password_1@example.org:8080/dir/../api?q=1#frag#> 4                                                                                                                                        https://user:password@example.com#> 5                                                                                                                              https://www.example.com:8080/search%3D1%2B3#> 6                                                                                                                                 https://www.google.co.jp/search?q=ドイツ#> 7                                                                                                         https://www.example.com:8080?var1=foo&var2=ba%20r&var3=baz+larry#> 8                                                                                                                                   https://user:password@example.com:8080#> 9                                                                                                                                        https://user:password@example.com#> 10                                                                                                                                           https://user@example.com:8080#> 11                                                                   
                                                                             https://user@example.com#>    scheme   user   password             host port#> 1   https                    www.example.com#> 2   https                     www.google.com#> 3   https user_1 password_1      example.org 8080#> 4   https   user   password      example.com#> 5   https                    www.example.com 8080#> 6   https                   www.google.co.jp#> 7   https                    www.example.com 8080#> 8   https   user   password      example.com 8080#> 9   https   user   password      example.com#> 10  https   user                 example.com 8080#> 11  https   user                 example.com#>                                                                                                                                                 path#> 1#> 2  /maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519#> 3                                                                                                                                        /dir/../api#> 4#> 5                                                                                                                                        /search=1+3#> 6                                                                                                                                            /search#> 7#> 8#> 9#> 10#> 11#>                                                                                                                                             raw_path#> 1#> 2  /maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519#> 3#> 4#> 5                                                                                                                                    /search%3D1%2B3#> 6#> 7#> 8#> 9#> 10#> 11#>                           
     raw_query fragment#> 1#> 2#> 3                                    q=1     frag#> 4#> 5#> 6          q=%E3%83%89%E3%82%A4%E3%83%84#> 7  var1=foo&var2=ba%20r&var3=baz%2Blarry#> 8#> 9#> 10#> 11

Encoding URL:

Note: `urltools` encodes special characters to lower-case hex, i.e. “?” -> “%3f” instead of “%3F”.

string<-"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "(bm<- bench::mark(urlparse = urlparse::url_encoder(string),curl = curl::curl_escape(string),urltools = urltools::url_encode(string),base =URLencode(string,reserved = T),check = F))#> # A tibble: 4 × 6#>   expression      min   median `itr/sec` mem_alloc `gc/sec`#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>#> 1 urlparse     1.48µs   1.64µs   581380.      208B     0#> 2 curl         2.25µs   2.58µs   349595.    3.03KB     0#> 3 urltools     2.34µs   2.54µs   381930.    2.48KB     0#> 4 base        78.84µs  82.33µs    11746.   28.59KB     8.25show_relative(bm)#> # A tibble: 4 × 6#>   expression   min median `itr/sec` mem_alloc `gc/sec`#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>#> 1 urlparse    1      1         49.5       1        NaN#> 2 curl        1.53   1.57      29.8      14.9      NaN#> 3 urltools    1.58   1.55      32.5      12.2      NaN#> 4 base       53.4   50.2        1       141.       Infggplot2::autoplot(bm)

string<-"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "url<-paste0(sample(strsplit(string,"")[[1]],1e4,replace =TRUE),collapse ="")(bm<- bench::mark(urlparse = urlparse::url_encoder(url),curl = curl::curl_escape(url),urltools = urltools::url_encode(url),base =URLencode(url,reserved = T,repeated = T),check = F,filter_gc = F))#> # A tibble: 4 × 6#>   expression      min   median `itr/sec` mem_alloc `gc/sec`#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>#> 1 urlparse     86.4µs   88.3µs    10599.    15.8KB     0#> 2 curl         91.6µs   94.9µs    10306.        0B     0#> 3 urltools    241.8µs  247.8µs     3943.    15.8KB     0#> 4 base          6.7ms      7ms      138.   333.5KB     8.00show_relative(bm)#> # A tibble: 4 × 6#>   expression   min median `itr/sec` mem_alloc `gc/sec`#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>#> 1 urlparse    1      1         76.8       Inf      NaN#> 2 curl        1.06   1.08      74.7       NaN      NaN#> 3 urltools    2.80   2.81      28.6       Inf      NaN#> 4 base       77.6   79.3        1         Inf      Infggplot2::autoplot(bm)


[8]ページ先頭

©2009-2025 Movatter.jp