Correction:
/*
 * Remove HTTP headers from multiple gzip or single zip from stdin.
 *
 * flex definitions section.  The C declarations below are wrapped in
 * %{ ... %} so that flex copies them verbatim into the generated scanner
 * whether or not they are indented; unindented text outside such a block
 * would be read by flex as name definitions rather than as C code.
 */
%{
/* Prototype for fileno(); presumably declared here because the generated
 * scanner may call it while strict compiler modes hide the POSIX
 * declaration -- confirm against the build flags in use. */
int fileno (FILE *);
/* Prototype for setenv(); not referenced by the rules or main() visible in
 * this file. */
int setenv (const char *, const char *, int);
/* Expands to an assignment to yy_start, in the same form flex uses for its
 * BEGIN macro; not used by the rules visible in this file. */
#define jmp (yy_start) = 1 + 2 *
/* Header-suppression flag: incremented when an HTTP status line is matched
 * and reset to 0 when a bare CRLF line is matched.  The rules write their
 * matched text only while it is 0. */
int x;
%}
%option nounput noinput noyywrap
%%
HTTP\/[\40-\176]+\x0d\x0a {
        /* Status line: "HTTP/" followed by printable ASCII (octal 40-176)
         * and CRLF.  Nothing is written; bumping x suppresses output in
         * the rules below until a bare CRLF is seen. */
        x++;
}
[\40-\176]+:[\40-\176]+\r\n {
        /* Line of printable ASCII containing a colon and ending in CRLF.
         * Passed through unchanged unless x is non-zero. */
        if (x == 0) {
                fwrite(yytext, 1, yyleng, yyout);
        }
}
\x0D\x0A {
        /* Bare CRLF.  While x is non-zero it is dropped and x is cleared;
         * otherwise it is passed through and x stays 0. */
        if (x != 0) {
                x = 0;
        } else {
                fwrite(yytext, 1, yyleng, yyout);
        }
}
%%
/* Entry point: run the scanner once via yylex(), ignore its return value,
 * and finish with exit status 0. */
int main()
{
        (void) yylex();
        return 0;
}
Usage example:
Retrieve hostnames, IP addresses and (if available) sitemap URLs from the latest Common Crawl.
# Download the gzip-compressed list of robots.txt archive paths for this crawl.
ftp -4 https://data.commoncrawl.org/crawl-data/CC-MAIN-2023-50/robotstxt.paths.gz # <-- 180K
# Pipeline stages, in order:
#   gzip/head  - decompress the path list and keep the first 5 paths
#   sed        - turn each path into a GET request with Host and Connection
#                headers; every request but the last gets keep-alive, the
#                last gets close; "[]" is a placeholder for CR LF
#   tr         - translate "[" to CR and "]" to LF ('[]' is quoted so the
#                shell never treats it as a pattern)
#   openssl    - send all requests over one TLS connection
#   yy054      - presumably the scanner built from the flex source above,
#                stripping the HTTP response headers
#   zegrep     - decompress the remaining gzip data and keep the selected
#                header lines, saving them in 1.txt
gzip -dc robotstxt.paths.gz \
|head -5 \
|sed 's>.*>GET /& HTTP/1.1[]Host: data.commoncrawl.org[]Connection: >;
$!s/$/keep-alive[]/;$s/$/close[]/' \
|tr '[]' '\r\n' \
|openssl s_client -quiet -connect data.commoncrawl.org:443 \
|yy054 \
|zegrep -a '(^Sitemap:)|(^Host:)|(^WARC-Target-URI:)|(^WARC-IP-Address:)' > 1.txt
# Replace the shell with cat to print the collected lines.
exec cat 1.txt