httpd-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Rob Hartill <r...@imdb.com>
Subject Re: Apache 1.1 charset handling patch for review
Date Fri, 05 Jul 1996 15:33:57 GMT

Thanks for the explanation and patch. It will now be considered by
the developers for 1.2 or 2.0.


regards,
rob

>--ELM836576090-654-0_
>Content-Type: text/plain; charset=US-ASCII
>Content-Transfer-Encoding: 7bit
>
>One of the main problems for Russian-language WWW sites (or sites in any
>language whose charset is not ISO-8859-1) is choosing the
>right charset to match client capabilities. Russians have at least 4
>different Russian charsets in active use (I know that the same problem exists for
>Japanese too, for example). This problem can be solved by using "Accept-Charset:"
>on the client side and finding the proper document on the server side
>(according to the latest IETF-HTTP-V11 draft). The .var mechanism in Apache is
>well suited to this, but when I try to use
>	"Content-Type=text/html; charset=KOI8-R"
>in a .var file, I notice that Apache 1.1.0 ignores the charset parameter
>completely, so I made this patch, which uses "charset=" from .var
>files, senses the client's "Accept-Charset:" field and uses the charset quality
>parameter "qc" according to the latest IETF-HTTP-V11 draft. Moreover, as
>I hear, HTTP 1.1 is moving towards making "charset=" a required
>parameter, which makes this patch more useful.
>Most clients do not yet send "Accept-Charset", so there is a new problem:
>guessing the client's charset. According to the IETF-HTTP-V11 draft, when a charset
>is not specified, any charset is acceptable, so a guessed charset is
>acceptable too. My patch tries to guess the charset assuming a "one charset
>per operating system" paradigm; it is a configurable feature.
>Please apply this patch or at least tell me how I can improve things.
>Thanks in advance.
>
>-- 
>Andrey A. Chernov
><ache@nagual.ru>
>http://www.nagual.ru/~ache/
>
>--ELM836576090-654-0_
>Content-Type: text/plain
>Content-Disposition: attachment; filename=mod.patch
>Content-Description: mod.patch
>Content-Transfer-Encoding: 7bit
>
>*** conf/srm.conf-dist.orig	Fri Jul  5 06:41:53 1996
>--- conf/srm.conf-dist	Fri Jul  5 06:41:54 1996
>***************
>*** 122,127 ****
>--- 122,134 ----
>  
>  LanguagePriority en fr de
>  
>+ # GuessCharset allows you to do charset guessing for clients which
>+ # forget to specify an Accept-Charset header field. Guessing is based on
>+ # a User-Agent header field pattern.
>+ # Format: GuessCharset user-agent_pattern accept-charset_value
>+ # user-agent_pattern may contain '*' and '?' shell meta-characters
>+ # Example: GuessCharset "Mozilla/* (X11;*" "koi8-r; q=0.8"
>+ 
>  # Redirect allows you to tell clients about documents which used to exist in
>  # your server's namespace, but do not anymore. This allows you to tell the
>  # clients where to look for the relocated document.
>*** src/mod_negotiation.c.orig	Sun Jun  9 05:12:21 1996
>--- src/mod_negotiation.c	Fri Jul  5 06:45:29 1996
>***************
>*** 71,76 ****
>--- 71,77 ----
>  
>  typedef struct {
>      array_header *language_priority;
>+     table *charset_patterns;    /* Added with GuessCharset... */
>  } neg_dir_config;
>  
>  module negotiation_module;
>***************
>*** 81,86 ****
>--- 82,88 ----
>        (neg_dir_config *) palloc (p, sizeof (neg_dir_config));
>  
>      new->language_priority = make_array (p, 4, sizeof (char *));
>+     new->charset_patterns = make_table (p, 4);
>      return new;
>  }
>  
>***************
>*** 94,99 ****
>--- 96,103 ----
>      /* give priority to the config in the subdirectory */
>      new->language_priority = append_arrays (p, add->language_priority,
>  					    base->language_priority);
>+     new->charset_patterns = overlay_tables (p, add->charset_patterns,
>+ 					    base->charset_patterns);
>      return new;
>  }
>  
>***************
>*** 114,119 ****
>--- 118,129 ----
>      return NULL;
>  }
>  
>+ char *set_guess_charset (cmd_parms *cmd, neg_dir_config *m, char *pattern, char *charset)
>+ {
>+     table_set (m->charset_patterns, pattern, charset);
>+     return NULL;
>+ }
>+ 
>  int do_cache_negotiated_docs (server_rec *s)
>  {
>      return (get_module_config (s->module_config, &negotiation_module) != NULL);
>***************
>*** 124,129 ****
>--- 134,140 ----
>      NULL },
>  { "LanguagePriority", set_language_priority, NULL, OR_FILEINFO, ITERATE,
>      NULL },
>+ { "GuessCharset", set_guess_charset, NULL, OR_FILEINFO, TAKE2, NULL },
>  { NULL }
>  };
>  
>***************
>*** 139,145 ****
>--- 150,158 ----
>  
>  typedef struct accept_rec {
>      char *type_name;
>+     char *charset;
>      float quality;
>+     float qc;
>      float max_bytes;
>      float level;
>  } accept_rec;
>***************
>*** 168,175 ****
>--- 181,190 ----
>      char *file_name;
>      char *content_encoding;
>      char *content_language;
>+     char *charset;
>      float level;		/* Auxiliary to content-type... */
>      float qs;
>+     float qc;
>      float bytes;
>      int lang_index;
>      int is_pseudo_html;		/* text/html, *or* the INCLUDES_MAGIC_TYPEs */
>***************
>*** 195,200 ****
>--- 210,216 ----
>      array_header *accepts;	/* accept_recs */
>      array_header *accept_encodings;	/* accept_recs */
>      array_header *accept_langs;	/* accept_recs */
>+     array_header *accept_charsets; /* accept_recs */
>      array_header *avail_vars;	/* available variants */
>  } negotiation_state;
>  
>***************
>*** 209,219 ****
>--- 225,237 ----
>      mime_info->file_name = "";
>      mime_info->content_encoding = "";
>      mime_info->content_language = "";
>+     mime_info->charset = "";
>  
>      mime_info->is_pseudo_html = 0;
>      mime_info->level = 0.0;
>      mime_info->level_matched = 0.0;
>      mime_info->qs = 0.0;
>+     mime_info->qc = 0.0;
>      mime_info->quality = 0.0;
>      mime_info->bytes = 0;
>      mime_info->lang_index = -1;
>***************
>*** 227,233 ****
>--- 245,253 ----
>  void set_mime_fields (var_rec *var, accept_rec *mime_info)
>  {
>      var->type_name = mime_info->type_name;
>+     var->charset = mime_info->charset;
>      var->qs = mime_info->quality;
>+     var->qc = mime_info->qc;
>      var->quality = mime_info->quality; /* Initial quality is just qs */
>      var->level = mime_info->level;
>  
>***************
>*** 251,257 ****
>--- 271,279 ----
>  char *get_entry (pool *p, accept_rec *result, char *accept_line)
>  {
>      result->quality = 1.0;
>+     result->qc = 1.0;
>      result->max_bytes = 0.0;
>+     result->charset = "";
>      result->level = 0.0;
>      
>      /* Note that this handles what I gather is the "old format",
>***************
>*** 307,317 ****
>--- 329,349 ----
>  	if (parm[0] == 'q'
>  	    && (parm[1] == '\0' || (parm[1] == 's' && parm[2] == '\0')))
>  	    result->quality = atof(cp);
>+ 	else if (parm[0] == 'q' && parm[1] == 'c' && parm[2] == '\0')
>+ 	    result->qc = atof(cp);
>  	else if (parm[0] == 'm' && parm[1] == 'x' &&
>  		 parm[2] == 'b' && parm[3] == '\0')
>  	    result->max_bytes = atof(cp);
>  	else if (parm[0] == 'l' && !strcmp (&parm[1], "evel"))
>  	    result->level = atof(cp);
>+ 	else if (parm[0] == 'c' && !strcmp (&parm[1], "harset")) {
>+ 	    result->charset = cp;
>+ 	    if ((cp = strchr (result->charset, '\n')) != NULL)
>+ 		*cp = '\0';
>+ 	    if ((cp = strrchr (result->charset, '"')) != NULL)
>+ 		*cp = '\0';
>+ 	    str_tolower (result->charset);
>+ 	}
>      }
>  
>      if (*accept_line == ',') ++accept_line;
>***************
>*** 344,349 ****
>--- 376,409 ----
>   * Handling header lines from clients...
>   */
>  
>+ char *get_accept_charset (request_rec *r)
>+ {
>+     char *s;
>+ 
>+     if ((s = table_get (r->headers_in, "Accept-charset")) == NULL) {
>+ 	neg_dir_config *conf =
>+ 	     (neg_dir_config *) get_module_config (r->per_dir_config,
>+ 						   &negotiation_module);
>+ 	if (conf != NULL) {
>+ 	    char *agent = table_get (r->headers_in, "User-Agent");
>+ 
>+ 	    if (agent != NULL) {
>+ 		table *t = conf->charset_patterns;
>+ 		table_entry *elts = (table_entry *)t->elts;
>+ 		int i;
>+ 
>+ 		for (i = 0; i < t->nelts; ++i) {
>+ 		    if (!strcmp_match (agent, elts[i].key)) {
>+ 			s = elts[i].val;
>+ 			break;
>+ 		    }
>+ 		}
>+ 	    }
>+ 	}
>+     }
>+     return s;
>+ }
>+ 
>  negotiation_state *parse_accept_headers (request_rec *r)
>  {
>      negotiation_state *new =
>***************
>*** 359,364 ****
>--- 419,426 ----
>        do_header_line (r->pool, table_get (hdrs, "Accept-encoding"));
>      new->accept_langs =
>        do_header_line (r->pool, table_get (hdrs, "Accept-language"));
>+     new->accept_charsets =
>+       do_header_line (r->pool, get_accept_charset (r));
>      new->avail_vars = make_array (r->pool, 40, sizeof (var_rec));
>  
>      return new;
>***************
>*** 377,384 ****
>--- 439,448 ----
>    
>      new_accept->type_name = CGI_MAGIC_TYPE;
>      new_accept->quality = prefer_scripts ? 1e-20 : 1e20;
>+     new_accept->qc = 1.0;
>      new_accept->level = 0.0;
>      new_accept->max_bytes = 0.0;
>+     new_accept->charset = "";
>  
>      if (neg->accepts->nelts > 1) return;
>      
>***************
>*** 386,393 ****
>--- 450,459 ----
>      
>      new_accept->type_name = "*/*";
>      new_accept->quality = 1.0;
>+     new_accept->qc = 1.0;
>      new_accept->level = 0.0;
>      new_accept->max_bytes = 0.0;
>+     new_accept->charset = "";
>  }
>  
>  /*****************************************************************
>***************
>*** 712,717 ****
>--- 778,810 ----
>      return OK;
>  }
>  
>+ float charset_quality (negotiation_state *neg, var_rec *avail)
>+ {
>+     accept_rec *accs;
>+     char *charset;
>+     int i;
>+ 
>+     /* If no Accept-Charset is present, everything is acceptable */
>+ 
>+     if (!neg->accept_charsets->nelts)
>+ 	return 1.0;
>+ 
>+     charset = avail->charset;
>+     if (!*charset)
>+ 	charset = "iso-8859-1"; /* default */
>+ 
>+     accs = (accept_rec *)neg->accept_charsets->elts;
>+ 
>+     for (i = 0; i < neg->accept_charsets->nelts; ++i)
>+ 	if (!strcmp (charset, accs[i].type_name))
>+ 	    return accs[i].quality;
>+ 
>+     if (!strcmp (charset, "iso-8859-1"))
>+ 	return 1.0;
>+ 	    
>+     return 0.0;
>+ }
>+ 
>  /* This code implements a piece of the tie-breaking algorithm between
>   * variants of equal quality.  This piece is the treatment of variants
>   * of the same base media type, but different levels.  What we want to
>***************
>*** 943,954 ****
>  	for (j = 0; j < neg->avail_vars->nelts; ++j) {
>  	    
>  	    var_rec *variant = &avail_recs[j];
>! 	    float q = type->quality * variant->quality;
>  		
>  	    /* If we've already rejected this variant, don't waste time */
>  	    
>  	    if (q == 0.0) continue;	
>  	    
>  	    /* If media types don't match, forget it.
>  	     * (This includes the level check).
>  	     */
>--- 1036,1051 ----
>  	for (j = 0; j < neg->avail_vars->nelts; ++j) {
>  	    
>  	    var_rec *variant = &avail_recs[j];
>! 	    float q = type->quality * variant->quality * variant->qc;
>  		
>  	    /* If we've already rejected this variant, don't waste time */
>  	    
>  	    if (q == 0.0) continue;	
>  	    
>+ 	    q *= charset_quality(neg, variant);
>+ 
>+ 	    if (q == 0.0) continue;	
>+ 
>  	    /* If media types don't match, forget it.
>  	     * (This includes the level check).
>  	     */
>*** src/util_script.c.orig	Mon Jun  3 16:04:20 1996
>--- src/util_script.c	Fri Jul  5 06:41:54 1996
>***************
>*** 61,66 ****
>--- 61,68 ----
>  #include "http_request.h"       /* for sub_req_lookup_uri() */
>  #include "util_script.h"
>  
>+ extern char *get_accept_charset (request_rec *r);
>+ 
>  /*
>   * Various utility functions which are common to a whole lot of
>   * script-type extensions mechanisms, and might as well be gathered
>***************
>*** 130,136 ****
>      conn_rec *c = r->connection;
>      const char *rem_logname;
>      
>!     char port[40],*env_path;
>      
>      array_header *hdrs_arr = table_elts (r->headers_in);
>      table_entry *hdrs = (table_entry *)hdrs_arr->elts;
>--- 132,138 ----
>      conn_rec *c = r->connection;
>      const char *rem_logname;
>      
>!     char port[40],*env_path,*accept_charset;
>      
>      array_header *hdrs_arr = table_elts (r->headers_in);
>      table_entry *hdrs = (table_entry *)hdrs_arr->elts;
>***************
>*** 155,163 ****
>--- 157,169 ----
>  	    table_set (e, "CONTENT_LENGTH", hdrs[i].val);
>  	else if (!strcasecmp (hdrs[i].key, "Authorization"))
>  	    continue;
>+ 	else if (!strcasecmp (hdrs[i].key, "Accept-charset"))
>+ 	    continue;   /* do it later */
>  	else
>  	    table_set (e, http2env (r->pool, hdrs[i].key), hdrs[i].val);
>      }
>+     if ((accept_charset = get_accept_charset (r)) != NULL)
>+ 	table_set (e, "HTTP_ACCEPT_CHARSET", accept_charset);
>      
>      sprintf(port, "%d", s->port);
>  
>
>--ELM836576090-654-0_--

-- 
Rob Hartill (robh@imdb.com)
The Internet Movie Database (IMDb)  http://www.imdb.com/
           ...more movie info than you can poke a stick at.

Mime
View raw message