/* $Id: url.c,v 1.1.1.1 2001/04/23 14:26:40 ossi Exp $ *
 *
 * puf 0.9  Copyright (C) 2000,2001 by Oswald Buddenhagen <puf@ossi.cjb.net>
 * based on puf 0.1.x (C) 1999,2000 by Anders Gavare <gavare@hotmail.com>
 *
 * You may modify and distribute this code under the terms of the GPL.
 * There is NO WARRANTY of any kind. See COPYING for details.
 *
 * url.c - parse and manage urls
 *
 */

#include "puf.h"


/*  maximum number of directory levels below the top-level directory that
    same_dir() accepts; a negative value disables the check  */
int max_depth = -1;
/*  urls containing '?' are rejected by parse_add_url() when their
    istopdir value is <= this  */
int inhibit_cgiget = 0;
/*  if non-zero, parse_add_url() does not add non-top-level urls whose
    hostname is not cached yet  */
int economize_dns = 0;
/*  head of the singly linked url chain; do_add_url() prepends to it  */
url_t *urllist = NULL;
/*  head of the singly linked proxy chain; parse_proxy() prepends to it  */
proxy_t *proxylist = NULL;

/*  Calculate the hash code for a given url and look for duplicates.
 *
 *  path, len  local part of the url; need not be NUL-terminated
 *  hinfo      host info the url belongs to
 *  hashp      receives the hash code if the url is not in the chain yet
 *
 *  Returns 1 if a url with the same host info and the same local part is
 *  already in the url chain (*hashp is NOT written in this case).
 *  Returns 0 otherwise, after storing the hash code in *hashp.
 */
int find_url(char *path, int len, hinfo_t *hinfo, int *hashp)
{
    url_t *u;
    int hash;

    /*  the host info pointer is mixed in so equal paths on different
	hosts get different hash codes  */
    hash = calc_hash(path, len) ^ (int)hinfo;
    for (u = urllist; u; u = u->next) {
	if (hash != u->url_hash || u->host->info != hinfo)
	    continue;
	/*  Compare the lengths first: running memcmp() over len bytes of a
	    shorter stored local_part would read past its terminator.  */
	if (strlen(u->local_part) == (size_t)len
	    && !memcmp(u->local_part, path, len)) {
	    dbg(URL, ("http://%s/%.*s already in chain, not adding\n",
		 hinfo->name, len, path));
	    return 1;
	}
    }
    *hashp = hash;
    return 0;
}


/*  Decide whether a url may be followed from the given referer.
 *
 *  path, len  local part of the candidate url; need not be NUL-terminated
 *  referer    url the candidate was found in
 *
 *  Returns 1 if the referer's recurse_type is at least HOST_RECURSIVE.
 *  Otherwise returns 1 only if the candidate lies in the same top-level
 *  disposition directory as the referer (when the referer has one, i.e.
 *  disp_pathoff != -1) and does not nest deeper than max_depth
 *  directories (when max_depth >= 0).  Returns 0 in all other cases.
 */
int same_dir(char *path, int len, url_t *referer)
{
    int d, p = referer->disp_pathoff;
    char *lp = referer->local_part;

    if (referer->recurse_type >= HOST_RECURSIVE)
	return 1;

    /*  check, if in same top-level disposition directory as referer  */
    if (p != -1) {
	if (len < p)
	    goto notsub;
	/*  Advance to the end of the directory name.  The referer's path
	    may end right after that name (no trailing '/'), so stop at
	    the terminator too instead of scanning past the string.  */
	while (lp[p] && lp[p] != '/')
	    p++;
	if (len < p)
	    goto notsub;
	if (memcmp(path, lp, p))
	    goto notsub;
	/*  the name must be a complete path component of the candidate  */
	if (len > p && path[p] != '/')
	    goto notsub;
    }

    /*  now check, if max_depth reached: count the '/' that follow the
	top-level directory (or the whole path if there is none)  */
    if (max_depth >= 0) {
	for (p++, d = 0; p < len; p++) {
	    if (path[p] == '/' && ++d > max_depth) {
		dbg(URL, ("not added '/%.*s' (directories to deeply nested)\n",
		     len, path));
		return 0;
	    }
	}
    }

    return 1;

  notsub:
    dbg(URL, ("not added '/%.*s' (different top-dir)\n", len, path));
    return 0;
}


#define PR_NOURL -2
#define PR_BAD -1
#define PR_UNK 0
#define PR_HTTP 1

/*  Parse the "remote" part of a url: [http://][auth@]host[:port]
 *
 *  url, len     the url; need not be NUL-terminated
 *  guess_proto  if non-zero, a url without (usable) protocol prefix is
 *               treated as an http url starting directly with the host
 *  pp           PR_HTTP: index of the first character after host[:port]
 *               (the '/' starting the path, or len);
 *               PR_UNK: index of the ':' ending the protocol name
 *  auth,        PR_HTTP: the "user[:password]" part before '@' (pointer
 *  auth_len     into url and its length), or 0/0 if there is none
 *  port         PR_HTTP: port number, 80 if none was given
 *  hostbuf      PR_HTTP: lower-cased host name, NUL-terminated; must hold
 *               SHORTSTR bytes, longer names are truncated
 *  hnlp         PR_HTTP: length of the name stored in hostbuf
 *
 *  Returns one of the PR_* codes.  The port must start with decimal
 *  digits and lie in 1..65535, anything else yields PR_BAD.
 */
int parse_url(char *url, int len, int guess_proto, int *pp, 
	      char **auth, int *auth_len, 
	      u_short *port, char *hostbuf, int *hnlp)
{
    int p, po, ho, noho, hnl, i;
    long pn;

    checken("parse_url (top)");
    /*  get protocol  */
    for (p = 0; ; p++) {
	if (p >= len || (url[p] != ':' && !isalpha((unsigned char)url[p]))) {
	    /*  no protocol. impossible in good url.  */
	    if (!guess_proto)
		return PR_NOURL;
	    p = 0;
	    break;
	}
	if (url[p] == ':') {
	    /*  "//" needs both url[p + 1] and url[p + 2] inside the buffer  */
	    noho = len < p + 3 || url[p + 1] != '/' || url[p + 2] != '/';
	    if (noho && guess_proto) {
		/*  the ':' did not end a protocol name (e.g. host:port)  */
		p = 0;
		break;
	    }
	    if (len < 7 || strncasecmp(url, "http:", 5)) {
		*pp = p;	/*  for message  */
		return PR_UNK;
	    }
	    if (noho)
		return PR_BAD;	/*  no host.  */
	    p += 3;
	    break;
	}
    }

    /*  get host & port  */
    *auth = 0;
    *auth_len = 0;
    ho = p;
    po = -1;			/*  index of the last ':', -1 if none  */
    for (; p < len && url[p] != '/'; p++) {
	if (url[p] == ':')
	    po = p;
	else if (url[p] == '@') {
	    /*  everything so far was authentication; restart host scan  */
	    *auth = url + ho;
	    *auth_len = p - ho;
	    ho = p + 1;
	    po = -1;
	}
    }
    if (p == ho || po == ho)
	return PR_BAD;		/*  empty host.  */

    if (po >= 0) {
	/*  parse only inside [po + 1, p) and reject values that would
	    not fit into u_short instead of letting them wrap  */
	pn = 0;
	for (i = po + 1; i < p && isdigit((unsigned char)url[i]); i++) {
	    pn = pn * 10 + (url[i] - '0');
	    if (pn > 65535)
		return PR_BAD;	/*  invalid port.  */
	}
	if (!pn)
	    return PR_BAD;	/*  invalid port.  */
	*port = (u_short)pn;
    } else {
	*port = 80;
	po = p;
    }

    /*  host name is url[ho .. po - 1]; silently truncated to the buffer  */
    for (hnl = 0; ho < po && hnl < SHORTSTR - 1; ho++, hnl++)
	hostbuf[hnl] = tolower((unsigned char)url[ho]);
    hostbuf[hnl] = '\0';

    *hnlp = hnl;
    *pp = p;
    checken("parse_url (end)");
    return PR_HTTP;
}


/*  Parse a proxy url and return the proxy structure for it.
 *
 *  proxy  NUL-terminated proxy url; the protocol prefix may be omitted
 *  ratio  stored in the new proxy structure
 *
 *  If the host is already cached and a proxy on that host is in the proxy
 *  chain, that existing entry is returned.  Otherwise a new entry is
 *  allocated, filled in, prepended to the proxy chain and returned.
 *  Returns 0 on unsupported protocol, bad url, cached host without info,
 *  or if allocation / host lookup fails.
 */
proxy_t *parse_proxy(char *proxy, int ratio)
{
    char hostbuf[SHORTSTR], *auth, *tail;
    int hnl, auth_len, p, len, proto, path_off, path_len;
    u_short port;
    proxy_t *prox;
    host_t *host;

    checken("parse_proxy (top)");
    dbg(URL, ("parse_proxy '%s', ratio %d\n", proxy, ratio));
    len = strlen(proxy);
    proto = parse_url(proxy, len, 1, &p, &auth, &auth_len, &port, 
		      hostbuf, &hnl);
    if (proto == PR_UNK) {
	prx(ERR, "unsupported proxy protocol %.*s\n", p, proxy);
	return 0;
    }
    if (proto == PR_BAD)
	return 0;

    /*  reuse an existing entry if the host is known already  */
    host = host_lookup_fast(hostbuf, hnl);
    if (host != NULL) {
	if (!host->info)
	    return 0;
	for (prox = proxylist; prox; prox = prox->next) {
	    if (prox->host == host)
		return prox;
	}
    }

    /*  the cgi path is whatever follows the '/' after host[:port]  */
    path_off = p < len ? p + 1 : len;
    path_len = len - path_off;

    /*  cgi path and encoded authentication live behind the structure  */
    prox = mmalloc(sizeof(*prox) + (path_len + 1) + 
		   (auth ? len_enc_auth(auth_len) : 0));
    if (!prox)
	return 0;

    if (host) {
	prox->ready = 1;
    } else {
	/*  host not cached: start a full lookup on behalf of this proxy  */
	host = host_lookup_full(hostbuf, hnl, 0, prox);
	if (!host) {
	    free(prox);
	    return 0;
	}
	waiting_proxies++;
	prox->ready = 0;
    }

    memcpy(prox->cgi_path, proxy + path_off, path_len);
    tail = prox->cgi_path + path_len;
    *tail++ = '\0';
    dbg(URL, (" cgi_path '%s'\n", prox->cgi_path));

    if (!auth) {
	prox->have_auth = 0;
    } else {
	/*  encoded authentication goes right behind the cgi path  */
	encode_auth(tail, auth, auth_len);
	prox->have_auth = 1;
	dbg(URL, (" has auth.\n"));
    }

    prox->ratio = ratio;
    prox->score = 0;
    prox->port = port;
    prox->host = host;

    /*  prepend to the proxy chain  */
    prox->next = proxylist;
    proxylist = prox;

    checken("parse_proxy (end)");
    return prox;
}


/*  Parse a complete url string and create a url_t for it.
 *
 *  url, len     the url; need not be NUL-terminated
 *  disposition  optional NUL-terminated string stored behind the local
 *               part of the url, or NULL
 *  proxy        optional proxy stored with the url (see xtr_proxy()),
 *               or NULL
 *  strictproxy  stored as a flag in the url
 *  referer      url this one was found in, or NULL
 *  istopdir     non-zero for top-level urls; values > 1 also allow urls
 *               without protocol prefix
 *  relocs       stored in the url
 *  recurse      stored as the url's recurse_type
 *
 *  Returns -1 if the string is no url at all, 0 if the url was rejected
 *  (unsupported protocol, bad url, filtered, duplicate, out of memory,
 *  failed host lookup) and 1 if a url_t was created.  The new url_t is
 *  passed to add_url() right away if its host has info; otherwise it has
 *  only been handed to host_lookup_full().
 */
int parse_add_url(char *url, int len, char *disposition, 
		  proxy_t *proxy, int strictproxy,
		  url_t *referer, int istopdir, int relocs, int recurse)
{
    char hostbuf[SHORTSTR], *auth, *pt;
    int displen, hnl, auth_len, p, lp, dp, fp;
    /*  find_url() computes the hash only if the host is cached already;
	start with a defined value instead of storing an uninitialized one  */
    int hash = 0;
    u_short port;
    url_t *u;
    host_t *host;

    checken("parse_add_url (top)");
    switch (parse_url(url, len, istopdir > 1, &p, &auth, &auth_len, &port, 
	    hostbuf, &hnl)) {
	case PR_NOURL:
	    dbg(URL, ("'%.*s' is no URL\n", len, url));
	    return -1;
	case PR_UNK:
	    prx(WRN, "unsupported protocol %.*s\n", p, url);
	    /*  fall through  */
	case PR_BAD:
	    return 0;
    }

    if (!(host = host_lookup_fast(hostbuf, hnl))) {
	if (!istopdir && economize_dns) {
	    prx(WRN, "not adding '%.*s' (non-cached hostname)\n", len,
		url);
	    return 0;
	}
    } else {
	if (!host->info) {
	    /*  prx(ERR, "non-existent host in '%.*s'\n", len, url);  */
	    num_urls++;
	    num_urls_fail++;
	    return 0;
	}
	if (referer && referer->recurse_type <= HOST_RECURSIVE
	    && host->info != referer->host->info) {
	    dbg(URL, ("not adding '%.*s' (different host)\n", len, url));
	    return 0;
	}
    }

    /*  get path in local part:
	lp = start of the local part (behind the '/' after the host),
	fp = start of the last path component,
	dp = offset (relative to lp) of the last directory component,
	     -1 if the path has no directory  */
    dp = -1;
    if (p < len) {
	lp = p + 1;
      repath:
	fp = ++p;
	if (p + 2 <= len && url[p] == '.' && url[p + 1] == '.' &&
	    (p + 2 == len || url[p + 2] == '/')) {
	    prx(WRN, "'..' in URL? We hit an evil site ...\n");
	    return 0;
	}
	for (; p < len; p++) {
	    if (url[p] == '/') {
		dp = fp - lp;
		goto repath;
	    }
	    if (url[p] == '?') {
		if (istopdir <= inhibit_cgiget) {
		    dbg(URL, ("not adding ?-URL '%.*s'\n", len, url));
		    return 0;
		}
		break;
	    }
	}
    } else
	lp = fp = len;

    if (host && find_url(url + lp, len - lp, host->info, &hash))
	return 0;

    if (referer && !same_dir(url + lp, len - lp, referer))
	return 0;

    displen = disposition ? strlen(disposition) + 1 : 0;

    /*  behind the structure: local part, disposition, proxy pointer and
	encoded authentication (each only if present)  */
    if (!(u = mmalloc(sizeof(*u) + 
		      (len - lp + 1) + 
		      displen +
		      (proxy ? sizeof(proxy_t *) : 0) +
		      (auth ? len_enc_auth(auth_len) : 0)
		     )))
	return 0;

    if (!host && !(host = host_lookup_full(hostbuf, hnl, u, 0))) {
	free(u);
	return 0;
    }

    memcpy(u->local_part, url + lp, len - lp);
    pt = u->local_part + len - lp;
    *pt++ = '\0';

    if (displen) {
	memcpy(pt, disposition, displen);
	pt += displen;
    }

    if (proxy) {
	/*  pt follows variable-length strings and may be misaligned for a
	    pointer, so copy the pointer bytewise  */
	memcpy(pt, &proxy, sizeof(proxy_t *));
	pt += sizeof(proxy_t *);
	u->haveproxy = 1;
    } else
	u->haveproxy = 0;

    if (auth) {
	u->http_auth = pt;
	encode_auth(pt, auth, auth_len);
#ifdef USE_MAGIC
	pt+=len_enc_auth(auth_len);
#endif
    } else if (referer)
	u->http_auth = referer->http_auth;	/*  inherit from referer  */
    else
	u->http_auth = 0;

#ifdef USE_MAGIC
    u->len = (int)(pt - (char *)&(u->len));
#endif

    u->url_hash = hash;
    u->referer = referer;
    u->host = host;
    u->port = port;
    u->path_len = fp - lp;
    u->recurse_type = recurse;
    u->is_top_dir = istopdir != 0;
    u->strictproxy = strictproxy != 0;
    u->relocs = relocs;
    u->havedisp = displen ? 1 : 0;
    /*  non-top-level urls inherit the offset from their referer; without
	a referer fall back to the own offset instead of dereferencing NULL  */
    u->disp_pathoff = (istopdir || !referer) ? dp : referer->disp_pathoff;

    checken("parse_url (pre-end)");

    dbg(URL, ("'%.*s' => '%.*s' @ %s : %i / '%.*s' '%.*s' %i\n", len, url,
	 auth_len, auth ? auth : "", host->name, port,
	 fp - lp, url + lp, len - fp, url + fp, dp));

    if (host->info)
	add_url(u);

    return 1;
}

/*  Enqueue a url for connecting.
 *  Allocates a queue entry pointing to u and appends it to
 *  queue_urls_connect.  Returns 1 on success, 0 if the allocation failed.
 */
int queue_url(url_t *u)
{
    wurl_t *wu = mmalloc(sizeof(*wu));

    if (wu) {
	wu->url = u;
	cq_append(queue_urls_connect, wu);
	return 1;
    }
    return 0;
}

/*  Enqueue a url for processing and, if that worked, add it to the url
    chain. If the url cannot be queued it is freed instead.  */
void add_url(url_t *u)
{
    checken("add_url (top)");
    if (!queue_url(u))
	free(u);
    else
	do_add_url(u);
    checken("add_url (end)");
}

/*  Put a url at the head of the url chain, reset its attempt counter and
    count it in num_urls.  */
void do_add_url(url_t *u)
{
    /*  link in front of the chain  */
    u->next = urllist;
    urllist = u;

    u->attempt = 0;
    num_urls += 1;

#ifdef USE_MAGIC
    /*  computed last, after all other fields of u have been written  */
    u->chk = calc_hash((char *)&(u->len), u->len);
#endif

    dbg(URL, ("added http://%s/%s\n", u->host->name, u->local_part));
}

/*  Return the preferred proxy stored with a url, or 0 if it has none.
 *
 *  The proxy pointer is stored behind the url's local part and, if
 *  present, behind the disposition string that follows the local part
 *  (this is how parse_add_url() lays the data out).
 */
proxy_t *xtr_proxy(url_t *u)
{
    char *pos;
    proxy_t *proxy;

    if (!u->haveproxy)
	return 0;

    /*  path_len is an offset inside local_part; from there skip to just
	behind the terminator of the local part  */
    pos = u->local_part + u->path_len;
    pos += strlen(pos) + 1;

    /*  skip the disposition string, if any  */
    if (u->havedisp)
	pos += strlen(pos) + 1;

    /*  the pointer follows variable-length strings and may be misaligned,
	so fetch it bytewise instead of dereferencing a proxy_t **  */
    memcpy(&proxy, pos, sizeof(proxy));
    return proxy;
}
