commit c6371c5f0f9186fa9b332a1ce546ebb6d71b5790
Author: Connor Etherington <[email protected]>
Date:   Fri Sep 13 21:27:24 2024 +0200

Update 13-September-2024___21:27:21
---
 README.md    | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 app.js       |  56 ++++++++++++++--
 package.json |  12 +++-
 3 files changed, 270 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ddd1ac3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,211 @@
+# @agentics/get
+
+A versatile command-line tool and library for making HTTP requests, scraping web content, and automating web interactions using Node.js.
+
+## Table of Contents
+
+- [Features](#features)
+- [Installation](#installation)
+- [Usage](#usage)
+  - [Command-Line Usage](#command-line-usage)
+  - [Options](#options)
+  - [Examples](#examples)
+- [Programmatic Usage](#programmatic-usage)
+- [Contributing](#contributing)
+- [License](#license)
+
+## Features
+
+- Perform HTTP GET and POST requests.
+- Scrape web pages using selectors.
+- Extract text, HTML, links, images, and cookies.
+- Evaluate custom JavaScript on web pages.
+- Save responses and cookies to files.
+- Use customizable request headers.
+- Support JavaScript evaluation with Puppeteer.
+
+## Installation
+
+```bash
+npm install -g @agentics/get
+```
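+
+If you only need the library API (see [Programmatic Usage](#programmatic-usage)), a project-local install should work as well; this assumes the package is published to the npm registry under the same name:
+
+```bash
+# Local install for library use (no global `get` binary on PATH)
+npm install @agentics/get
+```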
+
+## Usage
+
+### Command-Line Usage
+
+```bash
+get [options]
+```
+
+### Options
+
+| Option                 | Alias | Type    | Description                                                                            | Default |
+|------------------------|-------|---------|----------------------------------------------------------------------------------------|---------|
+| `--get`                | `-g`  | Boolean | Perform a GET request (default).                                                       | `true`  |
+| `--post`               | `-p`  | Boolean | Perform a POST request.                                                                | `false` |
+| `--url <url>`          | `-u`  | String  | URL to request. **(Required)**                                                         |         |
+| `--eval <script>`      | `-e`  | String  | Evaluate JavaScript on the page.                                                       |         |
+| `--save`               | `-s`  | Boolean | Save response and cookies to files (`response.json`, `cookies.json`).                  | `false` |
+| `--selector <css>`     | `-S`  | String  | CSS selector to scrape content from the page.                                          |         |
+| `--links`              | `-l`  | Boolean | Return all links (`<a href="">`) from the page.                                        | `false` |
+| `--text`               | `-t`  | Boolean | Return the text content of the page (default if no other output option is specified).  | `false` |
+| `--html`               | `-H`  | Boolean | Return the HTML content of the page.                                                   | `false` |
+| `--cookies`            | `-c`  | Boolean | Return cookies from the response headers.                                              | `false` |
+| `--header <key:value>` | `-h`  | String  | Custom headers for the request (can be used multiple times).                           |         |
+
+### Examples
+
+#### Basic GET Request
+
+```bash
+get -u https://example.com
+```
+
+#### POST Request with Custom Headers
+
+```bash
+get -p -u https://example.com/api -h "Content-Type: application/json" -h "Authorization: Bearer token"
+```
+
+#### Scrape Text Using a CSS Selector
+
+```bash
+get -u https://example.com -S ".article-title"
+```
+
+#### Evaluate JavaScript on the Page
+
+```bash
+get -u https://example.com -e "document.title"
+```
+
+#### Get All Links from a Page
+
+```bash
+get -u https://example.com -l
+```
+
+#### Save Response and Cookies to Files
+
+```bash
+get -u https://example.com -s
+```
+
+#### Get Page HTML
+
+```bash
+get -u https://example.com -H
+```
+
+#### Get Cookies from Response Headers
+
+```bash
+get -u https://example.com -c
+```
+
+### Combining Options
+
+You can combine multiple options to perform complex tasks. For example:
+
+```bash
+get -u https://example.com -S ".article-title" -l -c -s
+```
+
+This command will:
+
+- Scrape content matching `.article-title`.
+- Return all links from the page.
+- Return cookies from the response headers.
+- Save the response and cookies to files.
+
+## Programmatic Usage
+
+You can also use `@agentics/get` as a library in your Node.js projects.
+
+### Importing the Module
+
+```javascript
+const get = require('@agentics/get');
+```
+
+### Using `exportFunctions`
+
+The module exports a single async function (named `exportFunctions` internally and imported above as `get`) that performs web scraping tasks programmatically.
+
+#### Syntax
+
+```javascript
+const results = await get(url, options);
+```
+
+#### Parameters
+
+- `url` *(String)*: The URL to request.
+- `options` *(Object, optional)*: Configuration options.
+
+#### Available Options
+
+- `post` *(Boolean)*: Use POST request instead of GET.
+- `headers` *(Object)*: Custom headers for the request.
+- `cookies` *(Boolean)*: Return cookies from the response headers.
+- `links` *(Boolean)*: Return all links from the page.
+- `html` *(Boolean)*: Return the HTML content of the page.
+- `text` *(Boolean)*: Return the text content of the page.
+- `selector` *(String)*: CSS selector to scrape content.
+- `eval` *(String)*: JavaScript code to evaluate on the page.
+- `save` *(Boolean)*: Save response and cookies to files.
+
+#### Example
+
+```javascript
+const get = require('@agentics/get');
+
+(async () => {
+  try {
+    const url = 'https://example.com';
+    const options = {
+      headers: {
+        'User-Agent': 'CustomUserAgent/1.0',
+      },
+      text: true,
+      links: true,
+      cookies: true,
+    };
+
+    const results = await get(url, options);
+
+    console.log('Text Content:', results.text);
+    console.log('Links:', results.links);
+    console.log('Cookies:', results.cookies);
+  } catch (error) {
+    console.error('Error:', error);
+  }
+})();
+```
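+
+The `selector` and `eval` options can be combined in the same call; per the implementation, their results come back under the `scraped` and `evaluated` keys. A minimal sketch (the URL and selector are placeholders):
+
+```javascript
+const get = require('@agentics/get');
+
+(async () => {
+  // Scrape headings and read the document title in one call.
+  const results = await get('https://example.com', {
+    selector: 'h1',          // returned as results.scraped
+    eval: 'document.title',  // returned as results.evaluated
+  });
+
+  console.log('Scraped:', results.scraped);
+  console.log('Evaluated:', results.evaluated);
+})();
+```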
+
+#### Default Options
+
+If no options are provided, the following defaults are used:
+
+```javascript
+{
+  cookies: true,
+  links: true,
+  html: true,
+  text: true,
+}
+```
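+
+With these defaults, a bare call returns the text, links, HTML, and cookies in a single object. A minimal sketch of that behaviour (the URL is a placeholder):
+
+```javascript
+const get = require('@agentics/get');
+
+(async () => {
+  // No options passed, so the defaults above apply.
+  const results = await get('https://example.com');
+  console.log(Object.keys(results)); // expected: [ 'text', 'links', 'html', 'cookies' ]
+})();
+```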
+
+## Contributing
+
+Contributions are welcome! Please open an issue or submit a pull request on [GitLab](https://gitlab.com/a4to/get).
+
+## License
+
+This project is licensed under the [MIT License](LICENSE).
+
+---
+
+**Author:** Connor Etherington
+**Email:** [[email protected]](mailto:[email protected])
diff --git a/app.js b/app.js
index e16ac9a..f09578e 100755
--- a/app.js
+++ b/app.js
@@ -123,6 +123,11 @@ class Scraper {
     await this.ensureInitialized();
     return this.response;
   }
+
+  async cookies() {
+    await this.ensureInitialized();
+    return this.response.headers['set-cookie'];
+  }
 }
 
 const argv = yargs
@@ -182,13 +187,12 @@ const argv = yargs
 
 const method = argv.p ? 'POST' : 'GET';
 const url = argv.u;
-const headers = argv.h
-  ? argv.h.reduce((acc, header) => {
-      const [key, value] = header.split(':');
-      acc[key.trim()] = value.trim();
-      return acc;
-    }, {})
-  : {};
+
+const headers = argv.h ? argv.h.reduce((acc, header) => {
+  // Split on the first colon only, so header values that themselves
+  // contain colons (e.g. URLs) are not truncated.
+  const [key, ...rest] = header.split(':');
+  acc[key.trim()] = rest.join(':').trim();
+  return acc;
+}, {}) : {};
 
 const config = { method, headers, url, withCredentials: true };
 
@@ -215,6 +219,7 @@ const scraper = new Scraper(url, config);
   if (argv.H) console.log(await scraper.getHtml());
   if (argv.S) console.log(await scraper.scrape(argv.S));
   if (argv.e) console.log(await scraper.evaluate(argv.e));
+  if (argv.c) console.log(await scraper.cookies());
   if (argv.save) {
     const response = await scraper.getResponse();
     saveFiles(response);
@@ -224,3 +229,40 @@ const scraper = new Scraper(url, config);
   }
 })();
 
+const exportFunctions = async (url, options = {}) => {
+  if (Object.keys(options).length === 0) {
+    options = {
+      cookies: true,
+      links: true,
+      html: true,
+      text: true,
+    };
+  }
+
+  const scrapeOptions = {
+    headers: options.headers || {},
+    method: options.post ? 'POST' : 'GET',
+  };
+
+  const scraper = new Scraper(url, scrapeOptions);
+  await scraper.ensureInitialized();
+
+  const results = {};
+
+  if (options.text) results.text = await scraper.pageText();
+  if (options.links) results.links = await scraper.links();
+  if (options.html) results.html = await scraper.getHtml();
+  if (options.selector) results.scraped = await scraper.scrape(options.selector);
+  if (options.eval) results.evaluated = await scraper.evaluate(options.eval);
+  if (options.cookies) results.cookies = await scraper.cookies();
+
+  if (options.save) {
+    const response = await scraper.getResponse();
+    saveFiles(response);
+  }
+
+  return results;
+};
+
+module.exports = exportFunctions;
+
diff --git a/package.json b/package.json
index a7823aa..2aa571e 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,8 @@
 {
-  "name": "get",
+  "name": "@agentics/get",
   "version": "0.1.0",
   "main": "app.js",
+  "private": false,
   "scripts": {
     "start": "node ${npm_package_main}",
     "dev": "nodemon ${npm_package_main}"
@@ -9,7 +10,14 @@
   "bin": {
     "get": "./app.js"
   },
-  "author": "Connor Etherington <[email protected]>",
+  "repository": {
+    "type": "git",
+    "url": "git+https://gitlab.com/a4to/get.git"
+  },
+  "author": {
+    "name": "Connor Etherington",
+    "email": "[email protected]"
+  },
   "license": "MIT",
   "dependencies": {
     "axios": "^1.7.7",