// ------------------------------- //
// -------- Start of File -------- //
// ------------------------------- //
// ----------------------------------------------------------- // 
// C++ Source Code File Name: httpgrab.cpp
// Compiler Used: MSVC, BCC32, GCC, HPUX aCC, SOLARIS CC
// Produced By: glNET Software
// File Creation Date: 01/25/2000
// Date Last Modified: 06/27/2001
// Copyright (c) 2001 glNET Software
// ----------------------------------------------------------- // 
// ------------- Program Description and Details ------------- // 
// ----------------------------------------------------------- // 
/*
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
 
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  
USA

The HTTPGrab class is used to grab specified files from HTTP
servers.
*/
// ----------------------------------------------------------- // 
#include "httpgrab.h"

#if defined(__CONSOLE__)
#include <iostream.h>
#endif

// Default file name handed out by HTTPGrab::GetDefaultIndexFileName().
// GrabHTMLFile() substitutes it when the URL carries no file name.
const char *hgDefaultIndexFileName = "index.html";

urlQueueNode::urlQueueNode()
// Default constructor. Starts the node with an empty parent
// directory string; the URL info member is default constructed.
{
  this->url_parent_directory = "\0";
}

void urlQueueNode::Copy(const urlQueueNode &ob)
{
  u = ob.u;
  url_parent_directory = ob.url_parent_directory;
}

int operator==(const urlQueueNode &a, const urlQueueNode &b)
// Equality test supplied so that urlQueueNode objects work with the
// gxs linked list class. Returns 1 when both the URL information and
// the parent directory of the two nodes match, otherwise 0.
{
  if((a.u != b.u) || (a.url_parent_directory != b.url_parent_directory)) {
    return 0;
  }
  return 1;
}

HTTPGrab::HTTPGrab()
// Default constructor. Leaves the object with empty URL queues, no
// server/directory/credential information, a zero byte count and
// offline mode switched off.
{
  // Start with empty work queues
  files_to_download.ClearList();
  files_downloaded.ClearList();

  // Nothing is known about the site yet
  server_name = "\0";
  top_level_directory = "\0";
  url_parent_directory = "\0";

  // No credentials until SetWebAddress() supplies them
  username = "\0";
  password = "\0";

  bytes_received = 0;
  work_offline = 0;
}

HTTPGrab::~HTTPGrab()
// Destructor. Empties the pending and the completed URL queues.
{
  this->files_to_download.ClearList();
  this->files_downloaded.ClearList();
}

int HTTPGrab::CleanTagInfo(gxString &tag, const char *attrib)
// Reduce the attribute text of an HTML tag to the value of a single
// attribute. "tag" is modified in place; "attrib" is the attribute
// name, located with IFind(). Returns 1 if any text is left in "tag"
// after stripping, or 0 if the attribute is absent or nothing is left.
{
  const unsigned not_found = (unsigned)-1;
  unsigned attrib_pos = tag.IFind(attrib);
  if(attrib_pos == not_found) return 0;

  // Drop the attribute name with everything in front of it, then
  // everything up to and including the "=" that follows
  tag.DeleteAt(0, attrib_pos+strlen(attrib));
  tag.DeleteBeforeIncluding("=");

  // Strip the spaces surrounding what remains
  tag.TrimLeadingSpaces();
  tag.TrimTrailingSpaces();

  // Cut at the first space (further attributes) and at the first
  // "#" (document reference)
  tag.DeleteAfterIncluding(" ");
  tag.DeleteAfterIncluding("#");

  // Peel off the quotation marks around the value, if present
  tag.DeleteBeforeIncluding("\"");
  tag.DeleteAfterIncluding("\"");

  // An empty result means there was no file name to follow
  return (tag.length() == 0) ? 0 : 1;
}

void HTTPGrab::Handle_A_Tag()
{
  // Look for anchor tags pointing to documents
  if(list_ptr->data.end_instruction) return;
  gxString sbuf(list_ptr->data.attr); // Current list node
  if(sbuf.length() == 0) return;
  if(!CleanTagInfo(sbuf, "HREF")) return;
  LoadURL(sbuf);
}

void HTTPGrab::Handle_AREA_Tag()
{
 // Look for image map area tags
  gxString sbuf(list_ptr->data.attr); // Current list node
  if(sbuf.length() == 0) return;
  if(!CleanTagInfo(sbuf, "HREF")) return;
  LoadURL(sbuf);
}

void HTTPGrab::Handle_FRAME_Tag() 
{
  // Look for html files in frame tags
  gxString sbuf(list_ptr->data.attr); // Current list node
  if(sbuf.length() == 0) return;
  if(!CleanTagInfo(sbuf, "SRC")) return;
  LoadURL(sbuf);
}

void HTTPGrab::Handle_BODY_Tag()
{
  // Look for background image files
  gxString sbuf(list_ptr->data.attr); // Current list node
  if(sbuf.length() == 0) return;
  if(!CleanTagInfo(sbuf, "BACKGROUND")) return;
  LoadURL(sbuf);
}

void HTTPGrab::Handle_IMG_Tag()
{
  // Look for Images files
  gxString sbuf(list_ptr->data.attr); // Current list node
  if(sbuf.length() == 0) return;
  if(!CleanTagInfo(sbuf, "SRC")) return;
  LoadURL(sbuf);
}

int HTTPGrab::LoadURL(const gxString &attr)
{
  gxsURLInfo u;
  gxString sbuf(attr);

  int url_is_complete = 0;
  
  int rv = url.HasProtocol(sbuf);
  if(rv != gxsURL::gxs_Unknown_URL_protocol) {

    // Add all protocols to exclude here
    if(rv == gxsURL::gxs_mailto) return 0;

    // Check for a host name if a protocol was specified
    gxString hbuf;
    if(url.ParseHostName(sbuf, hbuf)) {
      if(!url.ParseURL(sbuf, u)) return 1; 
      url_is_complete = 1;
    }
    else { // Remove the protocol
      sbuf.DeleteBeforeIncluding(":");
      sbuf.TrimLeadingSpaces();
    }
  }

  // Assign a default protocol and port
  if(!url_is_complete) {
    u.port = gxSOCKET_HTTP_PORT;
    u.proto_type = gxsURL::gxs_http; 
    u.path = sbuf;
    u.host = server_name;
  }

  int is_absolute = 0; // True if this an absolute path

  if(u.path[0] != '/') { // This is a relative path
    u.path.InsertAt(0, "/");
    is_absolute = 0;
  }
  else
    is_absolute = 1; // This is an absolute path
  
  // Make the relative path an absolute path
  if(!is_absolute) {
    if(url_parent_directory != "/")
      u.path.InsertAt(0, url_parent_directory);
  }

  // Account for paths in the root directory that do not
  // specify the root directory as a parent.
  if(u.path[0] != '/') u.path.InsertAt(0, "/");

  // Collapse all ../ and ./ path separators
  char new_path[df_MAX_DIR_LENGTH]; int end_path = 0;
  if(u.path[u.path.length()-1] == '/') end_path = 1;
  df_PathSimplify(u.path.c_str(), new_path);
  if(StringCompare(u.path, new_path) != 0) u.path = new_path;
  if(end_path) u.path += "/"; // Put back the path terminator
  
  // Fill in the path, directory, and file name information
  url.ParseDirectory(u);

  // Get the this URL's parent directory
  gxString pbuf(u.dir);
  if(pbuf[pbuf.length()-1] == '/') pbuf.DeleteAfterLastIncluding("/");
  
  urlQueueNode n;
  n.u = u;
  n.url_parent_directory = pbuf;
  sbuf = n.u.path;
  
  if((!n.u.user) || (!n.u.passwd)) {
    // Set the username and password that will be used
    // for every document in this realm
    n.u.user = username;
    n.u.passwd = password;
  }

  unsigned offset = server_name.Find(u.host);
  if(offset != -1) {
    // Do not download files above the top level directory
    if(StringCompare(top_level_directory, "/") == 0) {
	// We are starting in the root directory so download everything
	files_to_download.Insert(n);
      }
      else {
	offset = u.path.Find(top_level_directory);
	if(offset != -1) files_to_download.Insert(n);
      }
    }

  return 0;
}

int HTTPGrab::GrabHTMLFile(gxsURLInfo &u)
// Download the document described by "u" into a local file named
// <host><dir>/<file>, creating the directories as needed. u.file,
// u.dir and u.local_file are updated with the names actually used.
//
// Returns 0 on success, or immediately when work_offline is set.
// For the HTTP statuses listed in the switch below the status code
// itself is returned and nothing is written; redirects additionally
// pass the new location to LoadURL(). Otherwise the non-zero result
// of df_mkdir(), df_Open() or RequestFile() is returned on failure.
{
  gxsHTTPClient client;
  gxsHTTPHeader hdr;
  bytes_received = 0;

  // Working offline with previously downloaded websites
  if(work_offline) return 0;

  // NOTE(review): the value returned by RequestHeader() is not
  // examined; only the parsed status code decides what happens next.
  int rv = client.RequestHeader(u, hdr);

  switch(hdr.http_status) {
    case gxsHTTP_STATUS_MOVED_PERMANENTLY :
    case gxsHTTP_STATUS_MOVED_TEMPORARILY :
      // The document has moved: hand the new location to LoadURL()
      LoadURL(hdr.location);
      return hdr.http_status;

    case gxsHTTP_STATUS_NO_CONTENT :
    case gxsHTTP_STATUS_NOT_MODIFIED :
    case gxsHTTP_STATUS_BAD_REQUEST :
    case gxsHTTP_STATUS_FORBIDDEN :
    case gxsHTTP_STATUS_NOT_FOUND :
    case gxsHTTP_STATUS_INTERNAL :
    case gxsHTTP_STATUS_NOT_IMPLEMENTED :
    case gxsHTTP_STATUS_BAD_GATEWAY :
      // Nothing to download, report the status to the caller
      return hdr.http_status;

    default:
      // Copy the server's reply info to the file requested
      break;
  }

  // Construct the local file name based on the URL information
  gxString sbuf(u.dir);
  if(u.file == "?") { // Is this a file or directory
    // Use the last component of the directory as the file name
    while(sbuf.DeleteBeforeIncluding("/")) ;
    u.file = sbuf;
    u.dir.DeleteAfterIncluding(sbuf.c_str());
  }
  if(!u.file) {
    u.file = GetDefaultIndexFileName();
  }

  // Replace any unsafe characters from the local file name
  gxString fbuf(u.file);
#if defined (__DOS__) || defined (__WIN32__)
  fbuf.FilterString("?");
#endif

  sbuf = u.host;
  sbuf += u.dir;
  if(sbuf[sbuf.length()-1] != '/') sbuf += "/";
  sbuf += fbuf;
  u.local_file = sbuf;

  // Create a directory to download the files in
  rv = df_mkdir(u.host.c_str());
  if(rv) return rv; // An error occurred

  // Create the subdirectories
  sbuf = u.host;
  sbuf += u.dir;

  rv = df_mkdir(sbuf.c_str());
  if(rv) return rv;

  // Fix up the local file name
  sbuf = u.local_file;
#if defined(__DOS__) || defined(__WIN32__)
  df_MakeDOSPath(sbuf.c_str());
#endif

  hgPrintConnectionMessage1(u);

  rv = df_Open(sbuf.c_str(),
	       DiskFileB::df_READWRITE, DiskFileB::df_CREATE, df_TRUNCATE);
  if(rv) return rv;

  rv = client.RequestFile(u, hdr, df_fptr);
  if(rv) {
    // The transfer failed after the output file was opened: close
    // it so the handle is not left open behind the caller's back
    df_Close();
    return rv;
  }
  client.Flush();

  df_Close();
  hgPrintDownloadCompleteMessage();

  return 0; // No errors were encountered
}

int HTTPGrab::SetWebAddress(const gxString &web_url)
// Set the starting point of the grab. Parses "web_url" and records
// its host as server_name and its directory (or "/" when the URL has
// none) as top_level_directory. If the URL carries both a username
// and a password they are remembered for later requests. The path of
// the URL is then passed to LoadURL().
//
// Returns 1 if the URL cannot be parsed, otherwise the result of
// LoadURL(): 0 for no errors, non-zero if the starting path was
// rejected.
{
  gxsURLInfo u;
  if(!url.ParseURL(web_url, u)) return 1;

  server_name = u.host;
  if(u.dir.length() > 0)
    top_level_directory = u.dir;
  else
    top_level_directory = "/";

  if(u.user.length() > 0 && u.passwd.length() > 0) {
    // Set the username and password that will be used
    // for every document in this realm
    username = u.user;
    password = u.passwd;
  }

  // Hand the starting document to LoadURL() and report its result
  // instead of claiming success unconditionally
  return LoadURL(u.path);
}

char *HTTPGrab::GetDefaultIndexFileName()
// Returns the default index file name as a non-const pointer. The
// pointer refers to hgDefaultIndexFileName with its const qualifier
// cast away, so callers should not write through it.
{
  return const_cast<char *>(hgDefaultIndexFileName);
}

const char *HTTPGrab::GetDefaultIndexFileName() const
// Read-only access to the default index file name
// (hgDefaultIndexFileName).
{
  const char *index_name = hgDefaultIndexFileName;
  return index_name;
}

int HTTPGrab::GrabHTMLFile(gxsURLInfo &u, fstream *outfile)
// Download the document described by "u" into the stream "outfile".
//
// Returns 0 on success, or immediately when work_offline is set.
// For the HTTP statuses listed in the switch below the status code
// itself is returned and nothing is downloaded; redirects
// additionally pass the new location to LoadURL(). A non-zero result
// from RequestFile() is returned unchanged.
//
// NOTE(review): "client" and "hdr" are not declared in this function;
// presumably they are declared outside it - confirm in httpgrab.h.
{
  bytes_received = 0;

  // Working offline with previously downloaded websites
  if(work_offline) return 0;

  // The result of the header request is not examined here; the
  // parsed status code alone decides what happens next
  int rv = client.RequestHeader(u, hdr);

  switch(hdr.http_status) {
    case gxsHTTP_STATUS_MOVED_PERMANENTLY :
    case gxsHTTP_STATUS_MOVED_TEMPORARILY :
      // The document has moved: hand the new location to LoadURL()
      LoadURL(hdr.location);
      return hdr.http_status;

    case gxsHTTP_STATUS_NO_CONTENT :
    case gxsHTTP_STATUS_NOT_MODIFIED :
    case gxsHTTP_STATUS_BAD_REQUEST :
    case gxsHTTP_STATUS_FORBIDDEN :
    case gxsHTTP_STATUS_NOT_FOUND :
    case gxsHTTP_STATUS_INTERNAL :
    case gxsHTTP_STATUS_NOT_IMPLEMENTED :
    case gxsHTTP_STATUS_BAD_GATEWAY :
      // Nothing to download, report the status to the caller
      return hdr.http_status;

    default:
      // Any other status: go on and fetch the document
      break;
  }

  hgPrintConnectionMessage2(u);

  rv = client.RequestFile(u, hdr, outfile);
  if(rv == 0) {
    client.Flush();
    hgPrintDownloadCompleteMessage();
  }
  return rv; // Zero when no errors were encountered
}

int HTTPGrab::GrabHTMLFile(gxsURLInfo &u, MemoryBuffer &mbuf)
// Download the document described by "u" into the memory buffer
// "mbuf".
//
// Returns 0 on success, or immediately when work_offline is set.
// For the HTTP statuses listed in the switch below the status code
// itself is returned and nothing is downloaded; redirects
// additionally pass the new location to LoadURL(). A non-zero result
// from RequestFile() is returned unchanged.
//
// NOTE(review): "client" and "hdr" are not declared in this function;
// presumably they are declared outside it - confirm in httpgrab.h.
{
  bytes_received = 0;

  // Working offline with previously downloaded websites
  if(work_offline) return 0;

  // The result of the header request is not examined here; the
  // parsed status code alone decides what happens next
  int rv = client.RequestHeader(u, hdr);

  switch(hdr.http_status) {
    case gxsHTTP_STATUS_MOVED_PERMANENTLY :
    case gxsHTTP_STATUS_MOVED_TEMPORARILY :
      // The document has moved: hand the new location to LoadURL()
      LoadURL(hdr.location);
      return hdr.http_status;

    case gxsHTTP_STATUS_NO_CONTENT :
    case gxsHTTP_STATUS_NOT_MODIFIED :
    case gxsHTTP_STATUS_BAD_REQUEST :
    case gxsHTTP_STATUS_FORBIDDEN :
    case gxsHTTP_STATUS_NOT_FOUND :
    case gxsHTTP_STATUS_INTERNAL :
    case gxsHTTP_STATUS_NOT_IMPLEMENTED :
    case gxsHTTP_STATUS_BAD_GATEWAY :
      // Nothing to download, report the status to the caller
      return hdr.http_status;

    default:
      // Any other status: go on and fetch the document
      break;
  }

  hgPrintConnectionMessage2(u);

  rv = client.RequestFile(u, hdr, mbuf);
  if(rv == 0) {
    hgPrintDownloadCompleteMessage();
  }
  return rv; // Zero when no errors were encountered
}

void hgPrintNumBytesReceived(int bytes)
// Console builds (__CONSOLE__ defined): write the number of bytes
// received to cout. Does nothing in all other builds.
{
#if defined(__CONSOLE__)
  cout << "Received " << bytes;
  cout << " bytes..." << endl;
#else
  (void)bytes; // Nothing is printed in non-console builds
#endif
}

void hgPrintDownloadCompleteMessage()
// Console builds (__CONSOLE__ defined): announce on cout that a
// download has finished. Does nothing in all other builds.
{
#if defined(__CONSOLE__)
  const char *message = "Download complete.";
  cout << message << endl;
#endif
}

void hgPrintConnectionMessage1(const gxsURLInfo &u)
// Console builds (__CONSOLE__ defined): print a blank line followed
// by the host, the remote path and the local output file of the
// download about to start. Does nothing in all other builds.
{
#if defined(__CONSOLE__)
  cout << endl
       << "Connecting to: " << u.host.c_str() << endl
       << "Grabbing file: " << u.path.c_str() << endl
       << "Writing output to: " << u.local_file.c_str() << endl;
#endif
}

void hgPrintConnectionMessage2(const gxsURLInfo &u)
// Console builds (__CONSOLE__ defined): print a blank line followed
// by the host and the remote path of the download about to start.
// Does nothing in all other builds.
{
#if defined(__CONSOLE__)
  cout << endl
       << "Connecting to: " << u.host.c_str() << endl
       << "Grabbing file: " << u.path.c_str() << endl;
#endif
}

#if defined(__CONSOLE__)
static void hgPrintNonEmptyField(const char *label, const gxString &value)
// Console helper for hgPringHTTPHeader(): write the label followed by
// the value on a line of its own, skipping values of zero length.
{
  if(value.length() > 0) cout << label << value.c_str() << endl;
}
#endif

void hgPringHTTPHeader(const gxsHTTPHeader &hdr)
// Console builds (__CONSOLE__ defined): dump the raw header and the
// parsed header fields to cout, followed by every Netscape cookie in
// the header. The function is interactive: it waits for the Enter key
// (cin.get) after the raw header and before each cookie. It also sets
// the showpoint and fixed flags and a precision of 1 on cout and does
// not restore them. Does nothing in all other builds.
{
#if defined(__CONSOLE__)
  // Raw header text as stored in the header object
  cout << endl;
  cout << "<------ Document Header ------>" << endl;
  cout << hdr.http_header;
  cout << "<----------------------------->" << endl;
  cout << endl;
  cout << "Press Enter to continue..." << endl;
  cin.get();

  cout << "Processing the header information..." << endl;
  cout << gxsHTTPStatusCodeMessage(hdr.http_status) << endl;

  // Print the version number with one decimal place
  cout.setf(ios::showpoint | ios::fixed);
  cout.precision(1);
  cout << "HTTP version: " << hdr.http_version << endl;

  cout << "Document status code: " << hdr.http_status << endl;

  // String fields are printed only when they hold something
  hgPrintNonEmptyField("Current Server: ", hdr.current_server);
  hgPrintNonEmptyField("Location: ", hdr.location);
  hgPrintNonEmptyField("Date Document Last Modified: ",
		       hdr.http_last_modified);
  hgPrintNonEmptyField("Date: ", hdr.date);
  hgPrintNonEmptyField("Expires: ", hdr.http_expires);
  hgPrintNonEmptyField("Entity tag: ", hdr.etag);
  if(hdr.authentication_needed) {
    cout << "Authentication required" << endl;
  }
  hgPrintNonEmptyField("Authentication scheme: ",
		       hdr.authentication_scheme);
  hgPrintNonEmptyField("Authentication realm: ", hdr.realm);
  hgPrintNonEmptyField("Authentication cookie: ", hdr.auth_cookie);
  hgPrintNonEmptyField("Content encoding: ", hdr.content_encoding);
  hgPrintNonEmptyField("Pragma: ", hdr.pragma);
  hgPrintNonEmptyField("Cache control: ", hdr.cache_control);
  hgPrintNonEmptyField("File extension: ", hdr.file_extension);

  // Numeric fields and flags
  if(hdr.length > -1) {
    cout << "Document length: " << hdr.length << endl;
  }
  if(hdr.not_found) {
    cout << "The requested document was not found" << endl;
  }
  if(!hdr.no_cache) {
    cout << "Using cached copy of the requested document" << endl;
  }
  if(hdr.accept_ranges) {
    cout << "Accepting ranges" << endl;
  }
  if(hdr.timeout > -1) {
    cout << "Timeout: " << hdr.timeout << endl;
  }
  if(hdr.max_conns > -1) {
    cout << "Max connects: " << hdr.max_conns << endl;
  }
  if(!hdr.keep_alive) {
    cout << "The server has closed this connection" << endl;
  }

  // Walk the cookie list, pausing for the Enter key before each entry
  gxListNode<gxsNetscapeCookie> *cookie_node;
  for(cookie_node = hdr.netscape_cookies.GetHead(); cookie_node;
      cookie_node = cookie_node->next) {
    cout << endl;
    cout << "Cookie information. Press Enter to continue..." << endl;
    cin.get();

    gxsNetscapeCookie citem(cookie_node->data);

    // Host, name and value are always printed
    cout << "Hostname: " << citem.host.c_str() << endl;
    cout << "Name: " << citem.name.c_str() << endl;
    cout << "Value: " << citem.value.c_str() << endl;

    hgPrintNonEmptyField("Expires: ", citem.expires);
    hgPrintNonEmptyField("Domain: ", citem.domain);
    hgPrintNonEmptyField("Path: ", citem.path);
    if(citem.secure) {
      cout << "This is a secure cookie" << endl;
    }
  }
#endif
}

void hgPrintURLInfo(const gxsURLInfo &u)
// Console builds (__CONSOLE__ defined): print every field of the URL
// information to cout and, if u.proxy is set, the same fields of the
// proxy entry under a heading of their own. Does nothing in all
// other builds.
//
// NOTE: the username and password fields are written to the console
// exactly as stored.
{
#if defined(__CONSOLE__)
  cout << "Uniform resource locator information" << endl;
  cout << "-------------------------------------------------------" << endl;
  cout << "Unchanged URL = " << u.url << endl; 
  cout << "URL protocol = " << u.proto << endl; 
  cout << "Extracted hostname = " << u.host << endl;
  cout << "Port number = " << u.port << endl;
  cout << "FTP type = " << u.ftp_type << endl;
  cout << "Path = " << u.path << endl;
  cout << "Directory = " << u.dir << endl;
  cout << "File = " << u.file << endl;
  cout << "Username = " << u.user << endl;
  cout << "Password = " << u.passwd << endl; 
  cout << "Local filename of the URL document = " << u.local << endl;
  cout << "Source that requested URI was obtained = " << u.referer << endl;

  if(u.proxy) { // Proxy information is attached to this URL
    cout << endl;
    cout << "Proxy server information" << endl;
    cout << "-------------------------------------------------------" << endl;
    cout << "Unchanged URL = " << u.proxy->url << endl; 
    cout << "URL protocol = " << u.proxy->proto << endl; 
    cout << "Extracted hostname = " << u.proxy->host << endl;
    cout << "Port number = " << u.proxy->port << endl;
    cout << "FTP type = " << u.proxy->ftp_type << endl;
    cout << "Path = " << u.proxy->path << endl;
    cout << "Directory = " << u.proxy->dir << endl;
    cout << "File = " << u.proxy->file << endl;
    cout << "Username = " << u.proxy->user << endl;
    cout << "Password = " << u.proxy->passwd << endl; 
    cout << "Local filename of the URL document = " << u.proxy->local << endl;
    cout << "Source that requested URI was obtained = "
	 << u.proxy->referer << endl;
  }
  cout << endl;
#endif
}
// ----------------------------------------------------------- //
// ------------------------------- //
// --------- End of File --------- //
// ------------------------------- //
