neon_arch committed
Commit: 221f38c
Parents: e4625c3 a28d559

Merge pull request #146 from neon-mmd/improve-async-multithreading

Cargo.lock CHANGED
@@ -292,6 +292,17 @@ version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"

+ [[package]]
+ name = "async-trait"
+ version = "0.1.71"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf"
+ dependencies = [
+  "proc-macro2 1.0.64",
+  "quote 1.0.29",
+  "syn 2.0.26",
+ ]
+
 [[package]]
 name = "autocfg"
 version = "0.1.8"
@@ -506,18 +517,18 @@ dependencies = [

 [[package]]
 name = "clap"
- version = "4.3.11"
+ version = "4.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d"
+ checksum = "3eab9e8ceb9afdade1ab3f0fd8dbce5b1b2f468ad653baf10e771781b2b67b73"
 dependencies = [
  "clap_builder",
 ]

 [[package]]
 name = "clap_builder"
- version = "4.3.11"
+ version = "4.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b"
+ checksum = "9f2763db829349bf00cfc06251268865ed4363b93a943174f638daf3ecdba2cd"
 dependencies = [
  "anstyle",
  "clap_lex",
@@ -784,7 +795,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
 dependencies = [
  "quote 1.0.29",
- "syn 2.0.25",
+ "syn 2.0.26",
 ]

 [[package]]
@@ -1457,7 +1468,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
 dependencies = [
  "hermit-abi",
- "rustix 0.38.3",
+ "rustix 0.38.4",
  "windows-sys",
 ]

@@ -1834,7 +1845,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.25",
+ "syn 2.0.26",
 ]

 [[package]]
@@ -1952,7 +1963,7 @@ dependencies = [
  "pest_meta",
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.25",
+ "syn 2.0.26",
 ]

 [[package]]
@@ -2054,7 +2065,7 @@ dependencies = [
  "phf_shared 0.11.2",
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.25",
+ "syn 2.0.26",
 ]

 [[package]]
@@ -2398,9 +2409,9 @@ dependencies = [

 [[package]]
 name = "regex-automata"
- version = "0.3.2"
+ version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "83d3daa6976cffb758ec878f108ba0e062a45b2d6ca3a2cca965338855476caf"
+ checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -2409,9 +2420,9 @@ dependencies = [

 [[package]]
 name = "regex-syntax"
- version = "0.7.3"
+ version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846"
+ checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"

 [[package]]
 name = "reqwest"
@@ -2548,9 +2559,9 @@ dependencies = [

 [[package]]
 name = "rustix"
- version = "0.38.3"
+ version = "0.38.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4"
+ checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5"
 dependencies = [
  "bitflags 2.3.3",
  "errno",
@@ -2708,14 +2719,14 @@ checksum = "389894603bd18c46fa56231694f8d827779c0951a667087194cf9de94ed24682"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.25",
+ "syn 2.0.26",
 ]

 [[package]]
 name = "serde_json"
- version = "1.0.100"
+ version = "1.0.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "0f1e14e89be7aa4c4b78bdbdc9eb5bf8517829a600ae8eaa39a6e1d960b5185c"
+ checksum = "b5062a995d481b2308b6064e9af76011f2921c35f97b0468811ed9f6cd91dfed"
 dependencies = [
  "itoa 1.0.8",
  "ryu",
@@ -2937,9 +2948,9 @@ dependencies = [

 [[package]]
 name = "syn"
- version = "2.0.25"
+ version = "2.0.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "15e3fc8c0c74267e2df136e5e5fb656a464158aa57624053375eb9c8c6e25ae2"
+ checksum = "45c3457aacde3c65315de5031ec191ce46604304d2446e803d71ade03308d970"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
@@ -3009,7 +3020,7 @@ checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.25",
+ "syn 2.0.26",
 ]

 [[package]]
@@ -3164,7 +3175,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.25",
+ "syn 2.0.26",
 ]

 [[package]]
@@ -3343,9 +3354,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"

 [[package]]
 name = "unicode-ident"
- version = "1.0.10"
+ version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73"
+ checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"

 [[package]]
 name = "unicode-normalization"
@@ -3486,7 +3497,7 @@ dependencies = [
  "once_cell",
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.25",
+ "syn 2.0.26",
  "wasm-bindgen-shared",
 ]

@@ -3520,7 +3531,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.25",
+ "syn 2.0.26",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -3543,10 +3554,11 @@ dependencies = [

 [[package]]
 name = "websurfx"
- version = "0.13.17"
+ version = "0.14.0"
 dependencies = [
  "actix-files",
  "actix-web",
+ "async-trait",
  "criterion",
  "env_logger",
  "error-stack",
Cargo.toml CHANGED
@@ -1,6 +1,6 @@
 [package]
 name = "websurfx"
- version = "0.13.17"
+ version = "0.14.0"
 edition = "2021"
 description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
 repository = "https://github.com/neon-mmd/websurfx"
@@ -12,7 +12,7 @@ tokio = {version="*",features=["full"]}
 serde = {version="*",features=["derive"]}
 handlebars = { version = "4.3.6", features = ["dir_source"] }
 scraper = {version="*"}
- actix-web = {version="4.3.1"}
+ actix-web = {version="4.3.1", features = ["cookies"]}
 actix-files = {version="0.6.2"}
 serde_json = {version="*"}
 fake-useragent = {version="*"}
@@ -24,6 +24,7 @@ md5 = {version="*"}
 rand={version="*"}
 once_cell = {version="*"}
 error-stack = {version="0.3.1"}
+ async-trait = {version="*"}

 [dev-dependencies]
 rusty-hook = "^0.11.2"
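
The new `async-trait` dependency is what enables the `SearchEngine` trait introduced later in this PR: stable Rust at the time of this change does not allow `async fn` directly in traits, so the `#[async_trait]` macro rewrites such methods into ones returning boxed futures. A minimal, self-contained sketch of the pattern (the `Fetcher`/`Dummy` names are hypothetical, not part of this commit):

use async_trait::async_trait;

#[async_trait]
trait Fetcher {
    async fn fetch(&self, url: String) -> String;
}

struct Dummy;

#[async_trait]
impl Fetcher for Dummy {
    async fn fetch(&self, url: String) -> String {
        format!("fetched {url}")
    }
}

#[tokio::main]
async fn main() {
    // Boxed trait objects also work, which is what the aggregator relies on below.
    let fetcher: Box<dyn Fetcher + Send + Sync> = Box::new(Dummy);
    println!("{}", fetcher.fetch("https://example.com".to_string()).await);
}
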
src/config/parser.rs CHANGED
@@ -3,7 +3,7 @@

 use super::parser_models::Style;
 use rlua::Lua;
- use std::{format, fs, path::Path};
+ use std::{collections::HashMap, format, fs, path::Path};

 // ------- Constants --------
 static COMMON_DIRECTORY_NAME: &str = "websurfx";
@@ -18,6 +18,10 @@ static CONFIG_FILE_NAME: &str = "config.lua";
 /// * `style` - It stores the theming options for the website.
 /// * `redis_url` - It stores the redis connection url address on which the redis
 /// client should connect.
+ /// * `aggregator` - It stores the option to whether enable or disable production use.
+ /// * `logging` - It stores the option to whether enable or disable logs.
+ /// * `debug` - It stores the option to whether enable or disable debug mode.
+ /// * `upstream_search_engines` - It stores all the engine names that were enabled by the user.
 #[derive(Clone)]
 pub struct Config {
     pub port: u16,
@@ -27,12 +31,17 @@ pub struct Config {
     pub aggregator: AggregatorConfig,
     pub logging: bool,
     pub debug: bool,
+     pub upstream_search_engines: Vec<String>,
 }

 /// Configuration options for the aggregator.
+ ///
+ /// # Fields
+ ///
+ /// * `random_delay` - It stores the option to whether enable or disable random delays between
+ /// requests.
 #[derive(Clone)]
 pub struct AggregatorConfig {
-     /// Whether to introduce a random delay before sending the request to the search engine.
     pub random_delay: bool,
 }

@@ -66,6 +75,11 @@ impl Config {
             },
             logging: globals.get::<_, bool>("logging")?,
             debug: globals.get::<_, bool>("debug")?,
+             upstream_search_engines: globals
+                 .get::<_, HashMap<String, bool>>("upstream_search_engines")?
+                 .into_iter()
+                 .filter_map(|(key, value)| value.then_some(key))
+                 .collect(),
         })
     })
 }
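
The new config field reads the `upstream_search_engines` Lua table as a `HashMap<String, bool>` and keeps only the names mapped to `true`. A standalone sketch of that reduction (the engine names here are example data, not the crate's code):

use std::collections::HashMap;

fn enabled_engines(table: HashMap<String, bool>) -> Vec<String> {
    table
        .into_iter()
        // `value.then_some(key)` yields Some(key) only when value is true,
        // so disabled engines are dropped by filter_map.
        .filter_map(|(key, value)| value.then_some(key))
        .collect()
}

fn main() {
    let mut table = HashMap::new();
    table.insert("DuckDuckGo".to_string(), true);
    table.insert("Searx".to_string(), false);
    // Prints only ["DuckDuckGo"]; order is unspecified because HashMap is unordered.
    println!("{:?}", enabled_engines(table));
}
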
src/engines/duckduckgo.rs CHANGED
@@ -2,154 +2,150 @@
 //! by querying the upstream duckduckgo search engine with user provided query and with a page
 //! number if provided.

- use std::{collections::HashMap, time::Duration};
+ use std::collections::HashMap;

 use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};

 use crate::results::aggregation_models::RawSearchResult;

- use super::engine_models::EngineError;
+ use super::engine_models::{EngineError, SearchEngine};

 use error_stack::{IntoReport, Report, Result, ResultExt};

+ /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
+ /// reduce code duplication as well as allows to create vector of different search engines easily.
+ pub struct DuckDuckGo;
+
+ #[async_trait::async_trait]
+ impl SearchEngine for DuckDuckGo {
     /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
     /// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
     /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
     /// values are RawSearchResult struct and then returns it within a Result enum.
     ///
     /// # Arguments
     ///
     /// * `query` - Takes the user provided query to query to the upstream search engine with.
     /// * `page` - Takes an u32 as an argument.
     /// * `user_agent` - Takes a random user agent string as an argument.
     ///
     /// # Errors
     ///
     /// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
     /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
     /// provide results for the requested search query and also returns error if the scraping selector
     /// or HeaderMap fails to initialize.
-     pub async fn results(
-         query: &str,
-         page: u32,
-         user_agent: &str,
-     ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+     async fn results(
+         &self,
+         query: String,
+         page: u32,
+         user_agent: String,
+     ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
         // so that upstream server receives valid page number.
         let url: String = match page {
             1 => {
                 format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
             }
             _ => {
                 format!(
                     "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
                     query,
                     (page / 2 + (page % 2)) * 30,
                     (page / 2 + (page % 2)) * 30 + 1
                 )
             }
         };

         // initializing HeaderMap and adding appropriate headers.
         let mut header_map = HeaderMap::new();
         header_map.insert(
             USER_AGENT,
             user_agent
                 .parse()
                 .into_report()
                 .change_context(EngineError::UnexpectedError)?,
         );
         header_map.insert(
             REFERER,
             "https://google.com/"
                 .parse()
                 .into_report()
                 .change_context(EngineError::UnexpectedError)?,
         );
         header_map.insert(
             CONTENT_TYPE,
             "application/x-www-form-urlencoded"
                 .parse()
                 .into_report()
                 .change_context(EngineError::UnexpectedError)?,
         );
         header_map.insert(
             COOKIE,
             "kl=wt-wt"
                 .parse()
                 .into_report()
                 .change_context(EngineError::UnexpectedError)?,
         );

-         // fetch the html from upstream duckduckgo engine
-         let results: String = reqwest::Client::new()
-             .get(url)
-             .timeout(Duration::from_secs(5))
-             .headers(header_map) // add spoofed headers to emulate human behavior
-             .send()
-             .await
-             .into_report()
-             .change_context(EngineError::RequestError)?
-             .text()
-             .await
-             .into_report()
-             .change_context(EngineError::RequestError)?;
-
-         let document: Html = Html::parse_document(&results);
+         let document: Html = Html::parse_document(
+             &DuckDuckGo::fetch_html_from_upstream(self, url, header_map).await?,
+         );

         let no_result: Selector = Selector::parse(".no-results")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;

         if document.select(&no_result).next().is_some() {
             return Err(Report::new(EngineError::EmptyResultSet));
         }

         let results: Selector = Selector::parse(".result")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
         let result_title: Selector = Selector::parse(".result__a")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
         let result_url: Selector = Selector::parse(".result__url")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
         let result_desc: Selector = Selector::parse(".result__snippet")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;

         // scrape all the results from the html
         Ok(document
             .select(&results)
             .map(|result| {
                 RawSearchResult::new(
                     result
                         .select(&result_title)
                         .next()
                         .unwrap()
                         .inner_html()
                         .trim()
                         .to_string(),
                     format!(
                         "https://{}",
                         result
                             .select(&result_url)
                             .next()
                             .unwrap()
                             .inner_html()
                             .trim()
                     ),
                     result
                         .select(&result_desc)
                         .next()
                         .unwrap()
                         .inner_html()
                         .trim()
                         .to_string(),
                     vec!["duckduckgo".to_string()],
                 )
             })
             .map(|search_result| (search_result.visiting_url.clone(), search_result))
             .collect())
+     }
 }
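
For reference, the `s` and `dc` query parameters in the second DuckDuckGo URL branch above come from plain integer arithmetic on the page number. A tiny worked example (not part of the commit) of the values that formula produces:

fn main() {
    for page in 1u32..=5 {
        // Same expression as in the diff: s = (page / 2 + page % 2) * 30, dc = s + 1.
        let s = (page / 2 + (page % 2)) * 30;
        // Note: page 1 actually takes the html.duckduckgo.com branch; shown here only for comparison.
        println!("page {page}: s={s}, dc={}", s + 1);
    }
}
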
src/engines/engine_models.rs CHANGED
@@ -1,8 +1,9 @@
 //! This module provides the error enum to handle different errors associated while requesting data from
 //! the upstream search engines with the search query provided by the user.

- use error_stack::Context;
- use std::fmt;
+ use crate::results::aggregation_models::RawSearchResult;
+ use error_stack::{IntoReport, Result, ResultExt};
+ use std::{collections::HashMap, fmt, time::Duration};

 /// A custom error type used for handle engine associated errors.
 ///
@@ -40,4 +41,35 @@ impl fmt::Display for EngineError {
     }
 }

- impl Context for EngineError {}
+ impl error_stack::Context for EngineError {}
+
+ /// A trait to define common behaviour for all search engines.
+ #[async_trait::async_trait]
+ pub trait SearchEngine {
+     async fn fetch_html_from_upstream(
+         &self,
+         url: String,
+         header_map: reqwest::header::HeaderMap,
+     ) -> Result<String, EngineError> {
+         // fetch the html from upstream search engine
+         Ok(reqwest::Client::new()
+             .get(url)
+             .timeout(Duration::from_secs(30)) // Add timeout to request to avoid DDOSing the server
+             .headers(header_map) // add spoofed headers to emulate human behaviour
+             .send()
+             .await
+             .into_report()
+             .change_context(EngineError::RequestError)?
+             .text()
+             .await
+             .into_report()
+             .change_context(EngineError::RequestError)?)
+     }
+
+     async fn results(
+         &self,
+         query: String,
+         page: u32,
+         user_agent: String,
+     ) -> Result<HashMap<String, RawSearchResult>, EngineError>;
+ }
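
The point of putting `fetch_html_from_upstream` on the trait as a default method is that every engine shares the same request logic while still being usable as a heterogeneous trait object. A simplified, self-contained sketch of that shape (toy `Engine`/`name` API, not the crate's real types):

use async_trait::async_trait;

#[async_trait]
trait Engine: Send + Sync {
    // Default method shared by all implementors, analogous to fetch_html_from_upstream above.
    async fn name(&self) -> String {
        "generic engine".to_string()
    }
}

struct DuckDuckGo;
struct Searx;

#[async_trait]
impl Engine for DuckDuckGo {}

#[async_trait]
impl Engine for Searx {
    async fn name(&self) -> String {
        "searx".to_string()
    }
}

#[tokio::main]
async fn main() {
    // Different engine types collected into one vector of boxed trait objects.
    let engines: Vec<Box<dyn Engine>> = vec![Box::new(DuckDuckGo), Box::new(Searx)];
    for engine in &engines {
        println!("{}", engine.name().await);
    }
}
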
src/engines/searx.rs CHANGED
@@ -8,131 +8,130 @@ use std::collections::HashMap;

 use crate::results::aggregation_models::RawSearchResult;

- use super::engine_models::EngineError;
+ use super::engine_models::{EngineError, SearchEngine};
 use error_stack::{IntoReport, Report, Result, ResultExt};

+ /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
+ /// reduce code duplication as well as allows to create vector of different search engines easily.
+ pub struct Searx;
+
+ #[async_trait::async_trait]
+ impl SearchEngine for Searx {
     /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
     /// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
     /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
     /// values are RawSearchResult struct and then returns it within a Result enum.
     ///
     /// # Arguments
     ///
     /// * `query` - Takes the user provided query to query to the upstream search engine with.
     /// * `page` - Takes an u32 as an argument.
     /// * `user_agent` - Takes a random user agent string as an argument.
     ///
     /// # Errors
     ///
     /// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
     /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
     /// provide results for the requested search query and also returns error if the scraping selector
     /// or HeaderMap fails to initialize.
-     pub async fn results(
-         query: &str,
-         page: u32,
-         user_agent: &str,
-     ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+     async fn results(
+         &self,
+         query: String,
+         page: u32,
+         user_agent: String,
+     ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
         // so that upstream server receives valid page number.
         let url: String = format!("https://searx.work/search?q={query}&pageno={page}");

         // initializing headers and adding appropriate headers.
         let mut header_map = HeaderMap::new();
         header_map.insert(
             USER_AGENT,
             user_agent
                 .parse()
                 .into_report()
                 .change_context(EngineError::UnexpectedError)?,
         );
         header_map.insert(
             REFERER,
             "https://google.com/"
                 .parse()
                 .into_report()
                 .change_context(EngineError::UnexpectedError)?,
         );
         header_map.insert(
             CONTENT_TYPE,
             "application/x-www-form-urlencoded"
                 .parse()
                 .into_report()
                 .change_context(EngineError::UnexpectedError)?,
         );
         header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);

-         // fetch the html from upstream searx instance engine
-         let results: String = reqwest::Client::new()
-             .get(url)
-             .headers(header_map) // add spoofed headers to emulate human behaviours.
-             .send()
-             .await
-             .into_report()
-             .change_context(EngineError::RequestError)?
-             .text()
-             .await
-             .into_report()
-             .change_context(EngineError::RequestError)?;
-
-         let document: Html = Html::parse_document(&results);
+         let document: Html =
+             Html::parse_document(&Searx::fetch_html_from_upstream(self, url, header_map).await?);

         let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| {
                 format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
             })?;

         if let Some(no_result_msg) = document.select(&no_result).nth(1) {
             if no_result_msg.inner_html()
                 == "we didn't find any results. Please use another query or search in more categories"
             {
                 return Err(Report::new(EngineError::EmptyResultSet));
             }
         }

         let results: Selector = Selector::parse(".result")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
         let result_title: Selector = Selector::parse("h3>a")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
         let result_url: Selector = Selector::parse("h3>a")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;

         let result_desc: Selector = Selector::parse(".content")
             .map_err(|_| Report::new(EngineError::UnexpectedError))
             .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;

         // scrape all the results from the html
         Ok(document
             .select(&results)
             .map(|result| {
                 RawSearchResult::new(
                     result
                         .select(&result_title)
                         .next()
                         .unwrap()
                         .inner_html()
                         .trim()
                         .to_string(),
                     result
                         .select(&result_url)
                         .next()
                         .unwrap()
                         .value()
                         .attr("href")
                         .unwrap()
                         .to_string(),
                     result
                         .select(&result_desc)
                         .next()
                         .unwrap()
                         .inner_html()
                         .trim()
                         .to_string(),
                     vec!["searx".to_string()],
                 )
             })
             .map(|search_result| (search_result.visiting_url.clone(), search_result))
             .collect())
+     }
 }
src/results/aggregation_models.rs CHANGED
@@ -3,7 +3,7 @@

 use serde::{Deserialize, Serialize};

- use crate::config::parser_models::Style;
+ use crate::{config::parser_models::Style, engines::engine_models::EngineError};

 /// A named struct to store, serialize and deserializes the individual search result from all the
 /// scraped and aggregated search results from the upstream search engines.
@@ -16,7 +16,7 @@ use crate::config::parser_models::Style;
 /// * `url` - The url to be displayed below the search result title in html.
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which this results were provided.
- #[derive(Debug, Serialize, Deserialize)]
+ #[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResult {
     pub title: String,
@@ -116,6 +116,25 @@ impl RawSearchResult {
     }
 }

+ #[derive(Serialize, Deserialize)]
+ pub struct EngineErrorInfo {
+     pub error: String,
+     pub engine: String,
+ }
+
+ impl EngineErrorInfo {
+     pub fn new(error: &EngineError, engine: String) -> Self {
+         Self {
+             error: match error {
+                 EngineError::RequestError => String::from("RequestError"),
+                 EngineError::EmptyResultSet => String::from("EmptyResultSet"),
+                 EngineError::UnexpectedError => String::from("UnexpectedError"),
+             },
+             engine,
+         }
+     }
+ }
+
 /// A named struct to store, serialize, deserialize the all the search results scraped and
 /// aggregated from the upstream search engines.
 ///
@@ -124,12 +143,18 @@ impl RawSearchResult {
 /// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
 /// `SearchResult` structs.
 /// * `page_query` - Stores the current pages search query `q` provided in the search url.
+ /// * `style` - Stores the theming options for the website.
+ /// * `engine_errors_info` - Stores the information on which engines failed with their engine name
+ /// and the type of error that caused it.
+ /// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the
+ /// given search query.
 #[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResults {
     pub results: Vec<SearchResult>,
     pub page_query: String,
     pub style: Style,
+     pub engine_errors_info: Vec<EngineErrorInfo>,
 }

 impl SearchResults {
@@ -141,14 +166,22 @@ impl SearchResults {
     /// and stores it into a vector of `SearchResult` structs.
     /// * `page_query` - Takes an argument of current page`s search query `q` provided in
     /// the search url.
+     /// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
+     /// given search query.
-     pub fn new(results: Vec<SearchResult>, page_query: String) -> Self {
+     pub fn new(
+         results: Vec<SearchResult>,
+         page_query: String,
+         engine_errors_info: Vec<EngineErrorInfo>,
+     ) -> Self {
         SearchResults {
             results,
             page_query,
             style: Style::new("".to_string(), "".to_string()),
+             engine_errors_info,
         }
     }

+     /// A setter function to add website style to the return search results.
     pub fn add_style(&mut self, style: Style) {
         self.style = style;
     }
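
Because `SearchResults` is tagged `#[serde(rename_all = "camelCase")]`, the new `engine_errors_info` field shows up as `engineErrorsInfo` in the JSON cached in Redis and rendered by the templates. A small sketch (simplified mirror of the structs above, with made-up data) of that serialization:

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct EngineErrorInfo {
    error: String,
    engine: String,
}

#[derive(Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
struct SearchResults {
    page_query: String,
    engine_errors_info: Vec<EngineErrorInfo>,
}

fn main() {
    let results = SearchResults {
        page_query: "rust".to_string(),
        engine_errors_info: vec![EngineErrorInfo {
            error: "RequestError".to_string(),
            engine: "searx".to_string(),
        }],
    };
    // Prints: {"pageQuery":"rust","engineErrorsInfo":[{"error":"RequestError","engine":"searx"}]}
    println!("{}", serde_json::to_string(&results).unwrap());
}
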
src/results/aggregator.rs CHANGED
@@ -3,22 +3,41 @@

 use std::{collections::HashMap, time::Duration};

+ use error_stack::Report;
 use rand::Rng;
- use tokio::join;
+ use tokio::task::JoinHandle;

 use super::{
-     aggregation_models::{RawSearchResult, SearchResult, SearchResults},
+     aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
     user_agent::random_user_agent,
 };

- use crate::engines::{duckduckgo, searx};
+ use crate::engines::{
+     duckduckgo,
+     engine_models::{EngineError, SearchEngine},
+     searx,
+ };
+
+ /// Aliases for long type annotations
+ type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;

- /// A function that aggregates all the scraped results from the above upstream engines and
- /// then removes duplicate results and if two results are found to be from two or more engines
- /// then puts their names together to show the results are fetched from these upstream engines
- /// and then removes all data from the HashMap and puts into a struct of all results aggregated
- /// into a vector and also adds the query used into the struct this is necessary because
- /// otherwise the search bar in search remains empty if searched from the query url
+ /// The function aggregates the scraped results from the user-selected upstream search engines.
+ /// These engines can be chosen either from the user interface (UI) or from the configuration file.
+ /// The code handles this process by matching the selected search engines and adding them to a vector.
+ /// This vector is then used to create an asynchronous task vector using `tokio::spawn`, which returns
+ /// a future. This future is awaited in another loop. Once the results are collected, they are filtered
+ /// to remove any errors and ensure only proper results are included. If an error is encountered, it is
+ /// sent to the UI along with the name of the engine and the type of error. This information is finally
+ /// placed in the returned `SearchResults` struct.
+ ///
+ /// Additionally, the function eliminates duplicate results. If two results are identified as coming from
+ /// multiple engines, their names are combined to indicate that the results were fetched from these upstream
+ /// engines. After this, all the data in the `HashMap` is removed and placed into a struct that contains all
+ /// the aggregated results in a vector. Furthermore, the query used is also added to the struct. This step is
+ /// necessary to ensure that the search bar in the search remains populated even when searched from the query URL.
+ ///
+ /// Overall, this function serves to aggregate scraped results from user-selected search engines, handling errors,
+ /// removing duplicates, and organizing the data for display in the UI.
 ///
 /// # Example:
 ///
@@ -30,6 +49,9 @@ use crate::engines::{duckduckgo, searx};
 /// * `query` - Accepts a string to query with the above upstream search engines.
 /// * `page` - Accepts an u32 page number.
 /// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
+ /// * `debug` - Accepts a boolean value to enable or disable debug mode option.
+ /// * `upstream_search_engines` - Accepts a vector of search engine names which was selected by the
+ /// user through the UI or the config file.
 ///
 /// # Error
 ///
@@ -37,10 +59,11 @@ use crate::engines::{duckduckgo, searx};
 /// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct`
 /// containing appropriate values.
 pub async fn aggregate(
-     query: &str,
+     query: String,
     page: u32,
     random_delay: bool,
     debug: bool,
+     upstream_search_engines: Vec<String>,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
     let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
@@ -53,41 +76,106 @@ pub async fn aggregate(
     }

     // fetch results from upstream search engines simultaneously/concurrently.
-     let (ddg_map_results, searx_map_results) = join!(
-         duckduckgo::results(query, page, &user_agent),
-         searx::results(query, page, &user_agent)
-     );
-
-     let ddg_map_results = ddg_map_results.unwrap_or_else(|e| {
-         if debug {
-             log::error!("Error fetching results from DuckDuckGo: {:?}", e);
-         }
-         HashMap::new()
-     });
-
-     let searx_map_results = searx_map_results.unwrap_or_else(|e| {
-         if debug {
-             log::error!("Error fetching results from Searx: {:?}", e);
-         }
-         HashMap::new()
-     });
-
-     result_map.extend(ddg_map_results);
-
-     searx_map_results.into_iter().for_each(|(key, value)| {
-         result_map
-             .entry(key)
-             .and_modify(|result| {
-                 result.add_engines(value.clone().engine());
-             })
-             .or_insert_with(|| -> RawSearchResult {
-                 RawSearchResult::new(
-                     value.title.clone(),
-                     value.visiting_url.clone(),
-                     value.description.clone(),
-                     value.engine.clone(),
-                 )
-             });
-     });
+     let search_engines: Vec<Box<dyn SearchEngine + Send + Sync>> = upstream_search_engines
+         .iter()
+         .map(|engine| match engine.to_lowercase().as_str() {
+             "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
+             "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
+             &_ => panic!("Config Error: Incorrect config file option provided"),
+         })
+         .collect();
+
+     let task_capacity: usize = search_engines.len();
+
+     let tasks: FutureVec = search_engines
+         .into_iter()
+         .map(|search_engine| {
+             let query: String = query.clone();
+             let user_agent: String = user_agent.clone();
+             tokio::spawn(
+                 async move { search_engine.results(query, page, user_agent.clone()).await },
+             )
+         })
+         .collect();
+
+     let mut outputs = Vec::with_capacity(task_capacity);
+
+     for task in tasks {
+         if let Ok(result) = task.await {
+             outputs.push(result)
+         }
+     }
+
+     let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
+
+     // The code block `outputs.iter()` determines whether it is the first time the code is being run.
+     // It does this by checking the initial flag. If it is the first time, the code selects the first
+     // engine from which results are fetched and adds or extends them into the `result_map`. If the
+     // initially selected engine fails, the code automatically selects another engine to map or extend
+     // into the `result_map`. On the other hand, if an engine selected for the first time successfully
+     // fetches results and maps them into the `result_map`, the initial flag is set to false. Subsequently,
+     // the code iterates through the remaining engines one by one. It compares the fetched results from each
+     // engine with the results already present in the `result_map` to identify any duplicates. If duplicate
+     // results are found, the code groups them together with the name of the engine from which they were
+     // fetched, and automatically removes the duplicate results from the newly fetched data.
+     //
+     // Additionally, the code handles errors returned by the engines. It keeps track of which engines
+     // encountered errors and stores this information in a vector of structures called `EngineErrorInfo`.
+     // Each structure in this vector contains the name of the engine and the type of error it returned.
+     // These structures will later be added to the final `SearchResults` structure. The `SearchResults`
+     // structure is used to display an error box in the UI containing the relevant information from
+     // the `EngineErrorInfo` structure.
+     //
+     // In summary, this code block manages the selection of engines, handling of duplicate results, and tracking
+     // of errors in order to populate the `result_map` and provide informative feedback to the user through the
+     // `SearchResults` structure.
+     let mut initial: bool = true;
+     let mut counter: usize = 0;
+     outputs.iter().for_each(|results| {
+         if initial {
+             match results {
+                 Ok(result) => {
+                     result_map.extend(result.clone());
+                     counter += 1;
+                     initial = false
+                 }
+                 Err(error_type) => {
+                     engine_errors_info.push(EngineErrorInfo::new(
+                         error_type.downcast_ref::<EngineError>().unwrap(),
+                         upstream_search_engines[counter].clone(),
+                     ));
+                     counter += 1
+                 }
+             }
+         } else {
+             match results {
+                 Ok(result) => {
+                     result.clone().into_iter().for_each(|(key, value)| {
+                         result_map
+                             .entry(key)
+                             .and_modify(|result| {
+                                 result.add_engines(value.clone().engine());
+                             })
+                             .or_insert_with(|| -> RawSearchResult {
+                                 RawSearchResult::new(
+                                     value.title.clone(),
+                                     value.visiting_url.clone(),
+                                     value.description.clone(),
+                                     value.engine.clone(),
+                                 )
+                             });
+                     });
+                     counter += 1
+                 }
+                 Err(error_type) => {
+                     engine_errors_info.push(EngineErrorInfo::new(
+                         error_type.downcast_ref::<EngineError>().unwrap(),
+                         upstream_search_engines[counter].clone(),
+                     ));
+                     counter += 1
+                 }
+             }
+         }
+     });

     Ok(SearchResults::new(
@@ -104,5 +192,6 @@ pub async fn aggregate(
         })
         .collect(),
         query.to_string(),
+         engine_errors_info,
     ))
 }
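
The core change here is swapping `tokio::join!` over two hard-coded futures for a vector of `tokio::spawn` tasks (one per selected engine) whose `JoinHandle`s are awaited in turn, so engine failures are collected instead of aborting everything. A minimal sketch of that pattern with a hypothetical workload (not the crate's code):

use tokio::task::JoinHandle;

async fn fake_engine(name: &'static str) -> Result<String, String> {
    if name == "broken" {
        Err(format!("{name} failed"))
    } else {
        Ok(format!("{name} results"))
    }
}

#[tokio::main]
async fn main() {
    let engines = ["duckduckgo", "searx", "broken"];

    // One spawned task per engine; they run concurrently on the multithreaded runtime.
    let tasks: Vec<JoinHandle<Result<String, String>>> = engines
        .iter()
        .map(|&name| tokio::spawn(async move { fake_engine(name).await }))
        .collect();

    for task in tasks {
        // The outer Result covers task panics/cancellation; the inner one is the engine error,
        // which is reported instead of aborting the whole aggregation.
        match task.await {
            Ok(Ok(results)) => println!("ok: {results}"),
            Ok(Err(error)) => println!("engine error: {error}"),
            Err(join_error) => println!("task failed: {join_error}"),
        }
    }
}
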
src/server/routes.rs CHANGED
@@ -22,7 +22,7 @@ use serde::Deserialize;
 /// of the search url.
 /// * `page` - It stores the search parameter `page` (or pageno in simple words)
 /// of the search url.
- #[derive(Debug, Deserialize)]
+ #[derive(Deserialize)]
 struct SearchParams {
     q: Option<String>,
     page: Option<u32>,
@@ -51,6 +51,21 @@ pub async fn not_found(
         .body(page_content))
 }

+ /// A named struct which is used to deserialize the cookies fetched from the client side.
+ ///
+ /// # Fields
+ ///
+ /// * `theme` - It stores the theme name used in the website.
+ /// * `colorscheme` - It stores the colorscheme name used for the website theme.
+ /// * `engines` - It stores the user selected upstream search engines selected from the UI.
+ #[allow(dead_code)]
+ #[derive(Deserialize)]
+ struct Cookie {
+     theme: String,
+     colorscheme: String,
+     engines: Vec<String>,
+ }
+
 /// Handles the route of search page of the `websurfx` meta search engine website and it takes
 /// two search url parameters `q` and `page` where `page` parameter is optional.
 ///
@@ -72,7 +87,6 @@ pub async fn search(
     config: web::Data<Config>,
 ) -> Result<HttpResponse, Box<dyn std::error::Error>> {
     let params = web::Query::<SearchParams>::from_query(req.query_string())?;
-
     match &params.q {
         Some(query) => {
             if query.trim().is_empty() {
@@ -89,7 +103,7 @@ pub async fn search(
                 "http://{}:{}/search?q={}&page={}",
                 config.binding_ip, config.port, query, page
             );
-             let results_json = results(url, &config, query, page).await?;
+             let results_json = results(url, &config, query.to_string(), page, req).await?;
             let page_content: String = hbs.render("search", &results_json)?;
             Ok(HttpResponse::Ok().body(page_content))
         }
@@ -104,23 +118,51 @@ pub async fn search(
 async fn results(
     url: String,
     config: &Config,
-     query: &str,
+     query: String,
     page: u32,
+     req: HttpRequest,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     //Initialize redis cache connection struct
     let mut redis_cache = RedisCache::new(config.redis_url.clone())?;
     // fetch the cached results json.
     let cached_results_json = redis_cache.cached_json(&url);
-     // check if fetched results was indeed fetched or it was an error and if so
+     // check if fetched cache results was indeed fetched or it was an error and if so
     // handle the data accordingly.
     match cached_results_json {
-         Ok(results_json) => Ok(serde_json::from_str::<SearchResults>(&results_json).unwrap()),
+         Ok(results) => Ok(serde_json::from_str::<SearchResults>(&results).unwrap()),
         Err(_) => {
-             let mut results_json: crate::results::aggregation_models::SearchResults =
-                 aggregate(query, page, config.aggregator.random_delay, config.debug).await?;
-             results_json.add_style(config.style.clone());
-             redis_cache.cache_results(serde_json::to_string(&results_json)?, &url)?;
-             Ok(results_json)
+             // check if the cookie value is empty or not if it is empty then use the
+             // default selected upstream search engines from the config file otherwise
+             // parse the non-empty cookie and grab the user selected engines from the
+             // UI and use that.
+             let mut results: crate::results::aggregation_models::SearchResults = match req
+                 .cookie("appCookie")
+             {
+                 Some(cookie_value) => {
+                     let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
+                     aggregate(
+                         query,
+                         page,
+                         config.aggregator.random_delay,
+                         config.debug,
+                         cookie_value.engines,
+                     )
+                     .await?
+                 }
+                 None => {
+                     aggregate(
+                         query,
+                         page,
+                         config.aggregator.random_delay,
+                         config.debug,
+                         config.upstream_search_engines.clone(),
+                     )
+                     .await?
+                 }
+             };
+             results.add_style(config.style.clone());
+             redis_cache.cache_results(serde_json::to_string(&results)?, &url)?;
+             Ok(results)
         }
     }
 }
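
Since the route parses the `appCookie` value with `serde_json::from_str` into the `Cookie` struct above, the cookie is expected to carry a JSON object with `theme`, `colorscheme`, and `engines` keys. A sketch of that parsing (the concrete values below are made-up examples, not mandated by the commit):

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct Cookie {
    theme: String,
    colorscheme: String,
    engines: Vec<String>,
}

fn main() {
    let raw = r#"{"theme":"simple","colorscheme":"catppuccin-mocha","engines":["DuckDuckGo","Searx"]}"#;
    let cookie: Cookie = serde_json::from_str(raw).unwrap();
    // The parsed engine list is what gets forwarded to `aggregate` instead of the config default.
    println!("{:?}", cookie.engines);
}
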
websurfx/config.lua CHANGED
@@ -5,7 +5,7 @@ debug = false -- an option to enable or disable debug mode.
 -- ### Server ###
 port = "8080" -- port on which server should be launched
 binding_ip = "127.0.0.1" --ip address on the which server should be launched.
- production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users)
+ production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users (more than one))
 -- if production_use is set to true
 -- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.

@@ -26,3 +26,6 @@ theme = "simple" -- the theme name which should be used for the website

 -- ### Caching ###
 redis_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
+
+ -- ### Search Engines ###
+ upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.