new.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <stdbool.h>
#include <regex.h>
#include <unistd.h>
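/*
 * Build note (an assumption, not part of the original source): this file
 * needs libcurl and libxml2 headers and libraries, e.g. via pkg-config:
 *   gcc new.c -o crawler $(pkg-config --cflags --libs libcurl libxml-2.0)
 */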
struct CURLResponse
{
    char *html;
    size_t size;
};
struct QNode
{
    char *url;
    struct QNode *next;
};
struct Queue
{
    struct QNode *front, *rear;
};
struct Queue *createQueue();
void enQueue(struct Queue *q, char *url);
void deQueue(struct Queue *q);
void printQueue(struct Queue *queue);
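/*
 * The queue helpers declared above (createQueue, enQueue, deQueue, printQueue)
 * are not defined in this file; they presumably live in another translation
 * unit. The block below is a minimal sketch of plausible implementations,
 * assuming enQueue stores its own copy of the URL string (which is why
 * extract_links can free the string after enqueueing). Omit this block if the
 * real definitions are linked in from elsewhere.
 */
struct Queue *createQueue()
{
    struct Queue *q = malloc(sizeof(struct Queue));
    q->front = q->rear = NULL;
    return q;
}
void enQueue(struct Queue *q, char *url)
{
    struct QNode *node = malloc(sizeof(struct QNode));
    node->url = strdup(url); // keep an owned copy; the caller may free its buffer
    node->next = NULL;
    if (q->rear == NULL)
    {
        q->front = q->rear = node;
        return;
    }
    q->rear->next = node;
    q->rear = node;
}
void deQueue(struct Queue *q)
{
    if (q->front == NULL)
        return;
    struct QNode *temp = q->front;
    q->front = q->front->next;
    if (q->front == NULL)
        q->rear = NULL;
    free(temp->url);
    free(temp);
}
void printQueue(struct Queue *queue)
{
    for (struct QNode *node = queue->front; node != NULL; node = node->next)
        printf("%s\n", node->url);
}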
// Returns true when the URL is an absolute "https://" link (i.e. NOT relative).
bool relativelink(char *url)
{
    regex_t regex;
    int value;
    // Compare the URL against the pattern "https://"
    value = regcomp(&regex, "https://", 0);
    if (value != 0)
    {
        return false;
    }
    value = regexec(&regex, url, 0, NULL, 0);
    regfree(&regex);
    if (value == 0)
    {
        return true;
    }
    else
    {
        return false;
    }
}
static size_t WriteHTMLCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
    size_t realsize = size * nmemb;
    struct CURLResponse *mem = (struct CURLResponse *)userp;
    char *ptr = realloc(mem->html, mem->size + realsize + 1);
    if (!ptr)
    {
        printf("Not enough memory available (realloc returned NULL)\n");
        return 0;
    }
    mem->html = ptr;
    memcpy(&(mem->html[mem->size]), contents, realsize);
    mem->size += realsize;
    mem->html[mem->size] = 0;
    return realsize;
}
struct CURLResponse GetRequest(CURL *curl_handle, const char *url)
{
    CURLcode res;
    struct CURLResponse response;
    // initialize the response
    response.html = malloc(1);
    response.size = 0;
    // specify URL to GET
    curl_easy_setopt(curl_handle, CURLOPT_URL, url);
    // send all data returned by the server to WriteHTMLCallback
    curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteHTMLCallback);
    // pass "response" to the callback function
    curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&response);
    // set a User-Agent header
    curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36");
    // perform the GET request
    res = curl_easy_perform(curl_handle);
    // check for HTTP errors
    if (res != CURLE_OK)
    {
        fprintf(stderr, "GET request failed: %s\n", curl_easy_strerror(res));
    }
    return response;
}
int extract_links(struct Queue *q)
{
    // initialize curl globally
    curl_global_init(CURL_GLOBAL_ALL);
    // initialize a CURL instance
    CURL *curl_handle = curl_easy_init();
    // retrieve the HTML document of the target page
    struct CURLResponse response = GetRequest(curl_handle, "https://nodejs.org/docs/latest/api/");
    // dump the raw HTML to a file for inspection
    FILE *fptr = fopen("new.html", "w");
    fprintf(fptr, "%s", response.html);
    fclose(fptr);
    // parse the HTML into a libxml2 document
    htmlDocPtr doc = htmlReadMemory(response.html, (int)response.size, NULL, NULL, HTML_PARSE_NOERROR);
    // scrape anchor elements via XPath
    xmlXPathContextPtr context = xmlXPathNewContext(doc);
    xmlXPathObjectPtr productHTMLElements = xmlXPathEvalExpression((xmlChar *)"//a", context);
    // save href links to a file and enqueue them for exploration
    FILE *links = fopen("links.txt", "a+");
    for (int i = 0; i < productHTMLElements->nodesetval->nodeNr; ++i)
    {
        xmlNodePtr urlHTMLElement = productHTMLElements->nodesetval->nodeTab[i];
        char *url = (char *)xmlGetProp(urlHTMLElement, (xmlChar *)"href");
        if (url == NULL)
        {
            // anchor without an href attribute
            continue;
        }
        if (relativelink(url))
        {
            // absolute link: write it as-is
            fprintf(links, "%s\n", url);
            enQueue(q, url);
        }
        else
        {
            // relative link: prepend the base URL when writing
            fprintf(links, "https://nodejs.org/docs/latest/api/%s\n", url);
            enQueue(q, url);
        }
        // enQueue is assumed to store its own copy of the string, so it is
        // safe to release the libxml2 buffer here
        free(url);
    }
    fclose(links);
    // release libxml2 and curl resources
    free(response.html);
    xmlXPathFreeObject(productHTMLElements);
    xmlXPathFreeContext(context);
    xmlFreeDoc(doc);
    xmlCleanupParser();
    curl_easy_cleanup(curl_handle);
    curl_global_cleanup();
    return 0;
}
int main()
{
    struct Queue *q = createQueue();
    extract_links(q);
    // count the queued links
    struct QNode *temp = q->front;
    int i = 0;
    while (temp)
    {
        i++;
        temp = temp->next;
    }
    printQueue(q);
    printf("%d\n", i);
    return 0;
}