To: vim-dev@vim.org Subject: Patch 6.2.511 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 8bit ------------ Patch 6.2.511 Problem: Tags in Russian help files are in utf-8 encoding, which may be different from 'encoding'. Solution: Use the "TAG_FILE_ENCODING" field in the tags file to specify the encoding of the tags. Convert help tags from 'encoding' to the tag file encoding when searching for matches, do the reverse when listing help tags. Files: runtime/doc/tagsrch.txt, src/ex_cmds.c, src/tag.c *** ../vim-6.2.510/runtime/doc/tagsrch.txt Sun Jun 1 12:20:35 2003 --- runtime/doc/tagsrch.txt Wed Apr 28 11:58:29 2004 *************** *** 1,4 **** ! *tagsrch.txt* For Vim version 6.2. Last change: 2003 May 18 VIM REFERENCE MANUAL by Bram Moolenaar --- 1,4 ---- ! *tagsrch.txt* For Vim version 6.2. Last change: 2004 Apr 28 VIM REFERENCE MANUAL by Bram Moolenaar *************** *** 546,565 **** The first lines in the tags file can contain lines that start with !_TAG_ These are sorted to the first lines, only rare tags that start with "!" can ! sort to before them. Vim only recognizes the line that indicates if the file ! was sorted. When this line is found, Vim uses binary searching for the tags ! file: > ! !_TAG_FILE_SORTED<Tab>1 ! < A tag file may be case-fold sorted to avoid a linear search when 'ignorecase' ! is on. See 'tagbsearch' for details. The value '2' should be used then: > ! !_TAG_FILE_SORTED<Tab>2 ! < *tag-search* The command can be any Ex command, but often it is a search command. ! Examples: > ! tag1 file1 /^main(argc, argv)/ ! tag2 file2 108 The command is always executed with 'magic' not set. The only special characters in a search pattern are "^" (begin-of-line) and "$" (<EOL>). --- 548,575 ---- The first lines in the tags file can contain lines that start with !_TAG_ These are sorted to the first lines, only rare tags that start with "!" can ! sort to before them. Vim recognizes two items. 
The first one is the line ! that indicates if the file was sorted. When this line is found, Vim uses ! binary searching for the tags file: ! !_TAG_FILE_SORTED<Tab>1 ~ ! A tag file may be case-fold sorted to avoid a linear search when 'ignorecase' ! is on. See 'tagbsearch' for details. The value '2' should be used then: ! !_TAG_FILE_SORTED<Tab>2 ~ ! ! The other tag that Vim recognizes, but only when compiled with the ! |+multi_byte| feature, is the encoding of the tags file: ! !_TAG_FILE_ENCODING<Tab>utf-8 ~ ! Here "utf-8" is the encoding used for the tags. Vim will then convert the tag ! being searched for from 'encoding' to the encoding of the tags file. And when ! listing tags the reverse happens. When the conversion fails the unconverted ! tag is used. ! *tag-search* The command can be any Ex command, but often it is a search command. ! Examples: ! tag1 file1 /^main(argc, argv)/ ~ ! tag2 file2 108 ~ The command is always executed with 'magic' not set. The only special characters in a search pattern are "^" (begin-of-line) and "$" (<EOL>). *** ../vim-6.2.510/src/ex_cmds.c Mon Apr 19 20:26:42 2004 --- src/ex_cmds.c Wed Apr 28 15:02:02 2004 *************** *** 5275,5280 **** --- 5275,5285 ---- char_u *s; int i; char_u *fname; + # ifdef FEAT_MBYTE + int utf8 = MAYBE; + int this_utf8; + int firstline; + # endif /* * Find all *.txt files. *************** *** 5342,5349 **** --- 5347,5375 ---- } fname = gettail(files[fi]); + # ifdef FEAT_MBYTE + firstline = TRUE; + # endif while (!vim_fgets(IObuff, IOSIZE, fd) && !got_int) { + # ifdef FEAT_MBYTE + if (firstline) + { + /* Detect utf-8 file by a non-ASCII char in the first line. 
*/ + this_utf8 = FALSE; + for (s = IObuff; *s != NUL; ++s) + if (*s >= 0x80) + this_utf8 = TRUE; + if (utf8 == MAYBE) + utf8 = this_utf8; + else if (utf8 != this_utf8) + { + EMSG2(_("E670: Mix of help file encodings within a language: %s"), files[fi]); + got_int = TRUE; + } + firstline = FALSE; + } + # endif p1 = vim_strchr(IObuff, '*'); /* find first '*' */ while (p1 != NULL) { *************** *** 5426,5431 **** --- 5452,5462 ---- ++p2; } } + + # ifdef FEAT_MBYTE + if (utf8 == TRUE) + fprintf(fd_tags, "!_TAG_FILE_ENCODING\tutf-8\t//\n"); + # endif /* * Write the tags into the file. *** ../vim-6.2.510/src/tag.c Mon Apr 19 20:26:43 2004 --- src/tag.c Wed Apr 28 14:39:14 2004 *************** *** 1005,1010 **** --- 1005,1060 ---- #endif /* + * Structure to hold info about the tag pattern being used. + */ + typedef struct + { + char_u *pat; /* the pattern */ + int len; /* length of pat[] */ + char_u *head; /* start of pattern head */ + int headlen; /* length of head[] */ + regmatch_T regmatch; /* regexp program, may be NULL */ + } pat_T; + + static void prepare_pats __ARGS((pat_T *pats, int has_re)); + + /* + * Extract info from the tag search pattern "pats->pat". + */ + static void + prepare_pats(pats, has_re) + pat_T *pats; + int has_re; + { + pats->head = pats->pat; + pats->headlen = pats->len; + if (has_re) + { + /* When the pattern starts with '^' or "\\<", binary searching can be + * used (much faster). */ + if (pats->pat[0] == '^') + pats->head = pats->pat + 1; + else if (pats->pat[0] == '\\' && pats->pat[1] == '<') + pats->head = pats->pat + 2; + if (pats->head == pats->pat) + pats->headlen = 0; + else + for (pats->headlen = 0; pats->head[pats->headlen] != NUL; + ++pats->headlen) + if (vim_strchr((char_u *)(p_magic ? ".[~*\\$" : "\\$"), + pats->head[pats->headlen]) != NULL) + break; + if (p_tl != 0 && pats->headlen > p_tl) /* adjust for 'taglength' */ + pats->headlen = p_tl; + } + + if (has_re) + pats->regmatch.regprog = vim_regcomp(pats->pat, p_magic ? 
RE_MAGIC : 0); + else + pats->regmatch.regprog = NULL; + } + + /* * find_tags() - search for tags in tags files * * Return FAIL if search completely failed (*num_matches will be 0, *matchesp *************** *** 1053,1059 **** char_u *p; char_u *s; int i; - regmatch_T regmatch; /* regexp program may be NULL */ #ifdef FEAT_TAG_BINS struct tag_search_info /* Binary search file offsets */ { --- 1103,1108 ---- *************** *** 1124,1132 **** char_u *saved_pat = NULL; /* copy of pat[] */ #endif ! int patlen; /* length of pat[] */ ! char_u *pathead; /* start of pattern head */ ! int patheadlen; /* length of pathead[] */ #ifdef FEAT_TAG_BINS int findall = (mincount == MAXCOL || mincount == TAG_MANY); /* find all matching tags */ --- 1173,1188 ---- char_u *saved_pat = NULL; /* copy of pat[] */ #endif ! /* Use two sets of variables for the pattern: "orgpat" holds the values ! * for the original pattern and "convpat" converted from 'encoding' to ! * encoding of the tags file. "pats" point to either one of these. */ ! pat_T *pats; ! pat_T orgpat; /* holds unconverted pattern info */ ! #ifdef FEAT_MBYTE ! pat_T convpat; /* holds converted pattern info */ ! vimconv_T vimconv; ! #endif ! #ifdef FEAT_TAG_BINS int findall = (mincount == MAXCOL || mincount == TAG_MANY); /* find all matching tags */ *************** *** 1146,1151 **** --- 1202,1212 ---- int verbose = (flags & TAG_VERBOSE); help_save = curbuf->b_help; + orgpat.pat = pat; + pats = &orgpat; + #ifdef FEAT_MBYTE + vimconv.vc_type = CONV_NONE; + #endif /* * Allocate memory for the buffers that are used *************** *** 1176,1230 **** if (help_only) /* want tags from help file */ curbuf->b_help = TRUE; /* will be restored later */ ! patlen = (int)STRLEN(pat); #ifdef FEAT_MULTI_LANG if (curbuf->b_help) { /* When "@ab" is specified use only the "ab" language, otherwise * search all languages. */ ! if (patlen > 3 && pat[patlen - 3] == '@' ! && ASCII_ISALPHA(pat[patlen - 2]) ! && ASCII_ISALPHA(pat[patlen - 1])) { ! 
saved_pat = vim_strnsave(pat, patlen - 3); if (saved_pat != NULL) { ! help_lang_find = &pat[patlen - 2]; ! pat = saved_pat; ! patlen -= 3; } } } #endif ! if (p_tl != 0 && patlen > p_tl) /* adjust for 'taglength' */ ! patlen = p_tl; ! ! pathead = pat; ! patheadlen = patlen; ! if (has_re) ! { ! /* When the pattern starts with '^' or "\\<", binary searching can be ! * used (much faster). */ ! if (pat[0] == '^') ! pathead = pat + 1; ! else if (pat[0] == '\\' && pat[1] == '<') ! pathead = pat + 2; ! if (pathead == pat) ! patheadlen = 0; ! else ! for (patheadlen = 0; pathead[patheadlen] != NUL; ++patheadlen) ! if (vim_strchr((char_u *)(p_magic ? ".[~*\\$" : "\\$"), ! pathead[patheadlen]) != NULL) ! break; ! if (p_tl != 0 && patheadlen > p_tl) /* adjust for 'taglength' */ ! patheadlen = p_tl; ! } ! ! if (has_re) ! regmatch.regprog = vim_regcomp(pat, p_magic ? RE_MAGIC : 0); ! else ! regmatch.regprog = NULL; #ifdef FEAT_TAG_BINS /* This is only to avoid a compiler warning for using search_info --- 1237,1266 ---- if (help_only) /* want tags from help file */ curbuf->b_help = TRUE; /* will be restored later */ ! pats->len = (int)STRLEN(pat); #ifdef FEAT_MULTI_LANG if (curbuf->b_help) { /* When "@ab" is specified use only the "ab" language, otherwise * search all languages. */ ! if (pats->len > 3 && pat[pats->len - 3] == '@' ! && ASCII_ISALPHA(pat[pats->len - 2]) ! && ASCII_ISALPHA(pat[pats->len - 1])) { ! saved_pat = vim_strnsave(pat, pats->len - 3); if (saved_pat != NULL) { ! help_lang_find = &pat[pats->len - 2]; ! pats->pat = saved_pat; ! pats->len -= 3; } } } #endif + if (p_tl != 0 && pats->len > p_tl) /* adjust for 'taglength' */ + pats->len = p_tl; ! prepare_pats(pats, has_re); #ifdef FEAT_TAG_BINS /* This is only to avoid a compiler warning for using search_info *************** *** 1242,1254 **** * Only ignore case when TAG_NOIC not used or 'ignorecase' set. */ #ifdef FEAT_TAG_BINS ! regmatch.rm_ic = ((p_ic || !noic) ! 
&& (findall || patheadlen == 0 || !p_tbs)); for (round = 1; round <= 2; ++round) { ! linear = (patheadlen == 0 || !p_tbs || round == 2); #else ! regmatch.rm_ic = (p_ic || !noic); #endif /* --- 1278,1290 ---- * Only ignore case when TAG_NOIC not used or 'ignorecase' set. */ #ifdef FEAT_TAG_BINS ! pats->regmatch.rm_ic = ((p_ic || !noic) ! && (findall || pats->headlen == 0 || !p_tbs)); for (round = 1; round <= 2; ++round) { ! linear = (pats->headlen == 0 || !p_tbs || round == 2); #else ! pats->regmatch.rm_ic = (p_ic || !noic); #endif /* *************** *** 1566,1578 **** { state = TS_BINARY; sortic = TRUE; ! regmatch.rm_ic = (p_ic || !noic); } else state = TS_LINEAR; } ! if (state == TS_BINARY && regmatch.rm_ic && !sortic) { /* binary search won't work for ignoring case, use linear * search. */ --- 1602,1614 ---- { state = TS_BINARY; sortic = TRUE; ! pats->regmatch.rm_ic = (p_ic || !noic); } else state = TS_LINEAR; } ! if (state == TS_BINARY && pats->regmatch.rm_ic && !sortic) { /* binary search won't work for ignoring case, use linear * search. */ *************** *** 1612,1623 **** #endif } /* * Figure out where the different strings are in this line. * For "normal" tags: Do a quick check if the tag matches. * This speeds up tag searching a lot! */ ! if (patheadlen #ifdef FEAT_EMACS_TAGS && !is_etag #endif --- 1648,1687 ---- #endif } + #ifdef FEAT_MBYTE + if (lbuf[0] == '!' && pats == &orgpat + && STRNCMP(lbuf, "!_TAG_FILE_ENCODING\t", 20) == 0) + { + /* Convert the search pattern from 'encoding' to the + * specified encoding. */ + for (p = lbuf + 20; *p > ' ' && *p < 127; ++p) + ; + *p = NUL; + convert_setup(&vimconv, p_enc, lbuf + 20); + if (vimconv.vc_type != CONV_NONE) + { + convpat.pat = string_convert(&vimconv, pats->pat, NULL); + if (convpat.pat != NULL) + { + pats = &convpat; + pats->len = (int)STRLEN(pats->pat); + prepare_pats(pats, has_re); + pats->regmatch.rm_ic = orgpat.regmatch.rm_ic; + } + } + + /* Prepare for converting a match the other way around. 
*/ + convert_setup(&vimconv, lbuf + 20, p_enc); + continue; + } + #endif + /* * Figure out where the different strings are in this line. * For "normal" tags: Do a quick check if the tag matches. * This speeds up tag searching a lot! */ ! if (pats->headlen #ifdef FEAT_EMACS_TAGS && !is_etag #endif *************** *** 1674,1682 **** cmplen = (int)(tagp.tagname_end - tagp.tagname); if (p_tl != 0 && cmplen > p_tl) /* adjust for 'taglength' */ cmplen = p_tl; ! if (has_re && patheadlen < cmplen) ! cmplen = patheadlen; ! else if (state == TS_LINEAR && patheadlen != cmplen) continue; #ifdef FEAT_TAG_BINS --- 1738,1746 ---- cmplen = (int)(tagp.tagname_end - tagp.tagname); if (p_tl != 0 && cmplen > p_tl) /* adjust for 'taglength' */ cmplen = p_tl; ! if (has_re && pats->headlen < cmplen) ! cmplen = pats->headlen; ! else if (state == TS_LINEAR && pats->headlen != cmplen) continue; #ifdef FEAT_TAG_BINS *************** *** 1695,1704 **** * Compare the current tag with the searched tag. */ if (sortic) ! tagcmp = tag_strnicmp(tagp.tagname, pathead, (size_t)cmplen); else ! tagcmp = STRNCMP(tagp.tagname, pathead, cmplen); /* * A match with a shorter tag means to search forward. --- 1759,1768 ---- * Compare the current tag with the searched tag. */ if (sortic) ! tagcmp = tag_strnicmp(tagp.tagname, pats->head, (size_t)cmplen); else ! tagcmp = STRNCMP(tagp.tagname, pats->head, cmplen); /* * A match with a shorter tag means to search forward. *************** *** 1706,1714 **** */ if (tagcmp == 0) { ! if (cmplen < patheadlen) tagcmp = -1; ! else if (cmplen > patheadlen) tagcmp = 1; } --- 1770,1778 ---- */ if (tagcmp == 0) { ! if (cmplen < pats->headlen) tagcmp = -1; ! else if (cmplen > pats->headlen) tagcmp = 1; } *************** *** 1752,1758 **** } else if (state == TS_SKIP_BACK) { ! if (MB_STRNICMP(tagp.tagname, pathead, cmplen) != 0) state = TS_STEP_FORWARD; else /* Have to skip back more. Restore the curr_offset --- 1816,1822 ---- } else if (state == TS_SKIP_BACK) { ! 
if (MB_STRNICMP(tagp.tagname, pats->head, cmplen) != 0) state = TS_STEP_FORWARD; else /* Have to skip back more. Restore the curr_offset *************** *** 1762,1768 **** } else if (state == TS_STEP_FORWARD) { ! if (MB_STRNICMP(tagp.tagname, pathead, cmplen) != 0) { if ((off_t)ftell(fp) > search_info.match_offset) break; /* past last match */ --- 1826,1832 ---- } else if (state == TS_STEP_FORWARD) { ! if (MB_STRNICMP(tagp.tagname, pats->head, cmplen) != 0) { if ((off_t)ftell(fp) > search_info.match_offset) break; /* past last match */ *************** *** 1773,1779 **** else #endif /* skip this match if it can't match */ ! if (MB_STRNICMP(tagp.tagname, pathead, cmplen) != 0) continue; /* --- 1837,1843 ---- else #endif /* skip this match if it can't match */ ! if (MB_STRNICMP(tagp.tagname, pats->head, cmplen) != 0) continue; /* *************** *** 1824,1863 **** if (p_tl != 0 && cmplen > p_tl) /* adjust for 'taglength' */ cmplen = p_tl; /* if tag length does not match, don't try comparing */ ! if (patlen != cmplen) match = FALSE; else { ! if (regmatch.rm_ic) { ! match = (MB_STRNICMP(tagp.tagname, pat, cmplen) == 0); if (match) ! match_no_ic = (STRNCMP(tagp.tagname, pat, cmplen) == 0); } else ! match = (STRNCMP(tagp.tagname, pat, cmplen) == 0); } /* * Has a regexp: Also find tags matching regexp. */ match_re = FALSE; ! if (!match && regmatch.regprog != NULL) { int cc; cc = *tagp.tagname_end; *tagp.tagname_end = NUL; ! match = vim_regexec(&regmatch, tagp.tagname, (colnr_T)0); if (match) { ! matchoff = (int)(regmatch.startp[0] - tagp.tagname); ! if (regmatch.rm_ic) { ! regmatch.rm_ic = FALSE; ! match_no_ic = vim_regexec(&regmatch, tagp.tagname, (colnr_T)0); ! regmatch.rm_ic = TRUE; } } *tagp.tagname_end = cc; --- 1888,1928 ---- if (p_tl != 0 && cmplen > p_tl) /* adjust for 'taglength' */ cmplen = p_tl; /* if tag length does not match, don't try comparing */ ! if (pats->len != cmplen) match = FALSE; else { ! if (pats->regmatch.rm_ic) { ! 
match = (MB_STRNICMP(tagp.tagname, pats->pat, cmplen) == 0); if (match) ! match_no_ic = (STRNCMP(tagp.tagname, pats->pat, ! cmplen) == 0); } else ! match = (STRNCMP(tagp.tagname, pats->pat, cmplen) == 0); } /* * Has a regexp: Also find tags matching regexp. */ match_re = FALSE; ! if (!match && pats->regmatch.regprog != NULL) { int cc; cc = *tagp.tagname_end; *tagp.tagname_end = NUL; ! match = vim_regexec(&pats->regmatch, tagp.tagname, (colnr_T)0); if (match) { ! matchoff = (int)(pats->regmatch.startp[0] - tagp.tagname); ! if (pats->regmatch.rm_ic) { ! pats->regmatch.rm_ic = FALSE; ! match_no_ic = vim_regexec(&pats->regmatch, tagp.tagname, (colnr_T)0); ! pats->regmatch.rm_ic = TRUE; } } *tagp.tagname_end = cc; *************** *** 1914,1920 **** else mtt = MT_GL_OTH; } ! if (regmatch.rm_ic && !match_no_ic) mtt += MT_IC_OFF; if (match_re) mtt += MT_RE_OFF; --- 1979,1985 ---- else mtt = MT_GL_OTH; } ! if (pats->regmatch.rm_ic && !match_no_ic) mtt += MT_IC_OFF; if (match_re) mtt += MT_RE_OFF; *************** *** 1927,1932 **** --- 1992,2026 ---- */ if (ga_grow(&ga_match[mtt], 1) == OK) { + #ifdef FEAT_MBYTE + char_u *conv_line = NULL; + char_u *lbuf_line = lbuf; + + if (vimconv.vc_type != CONV_NONE) + { + /* Convert the tag line from the encoding of the tags + * file to 'encoding'. Then parse the line again. */ + conv_line = string_convert(&vimconv, lbuf, NULL); + if (conv_line != NULL) + { + if (parse_tag_line(conv_line, + #ifdef FEAT_EMACS_TAGS + is_etag, + #endif + &tagp) == OK) + lbuf_line = conv_line; + else + /* doesn't work, go back to unconverted line. */ + (void)parse_tag_line(lbuf, + #ifdef FEAT_EMACS_TAGS + is_etag, + #endif + &tagp); + } + } + #else + # define lbuf_line lbuf + #endif if (help_only) { #ifdef FEAT_MULTI_LANG *************** *** 2019,2025 **** * other tag: * without Emacs tags: */ ! 
len = (int)STRLEN(tag_fname) + (int)STRLEN(lbuf) + 3; #ifdef FEAT_EMACS_TAGS if (is_etag) len += (int)STRLEN(ebuf) + 1; --- 2113,2119 ---- * other tag: * without Emacs tags: */ ! len = (int)STRLEN(tag_fname) + (int)STRLEN(lbuf_line) + 3; #ifdef FEAT_EMACS_TAGS if (is_etag) len += (int)STRLEN(ebuf) + 1; *************** *** 2049,2055 **** else *s++ = NUL; #endif ! STRCPY(s, lbuf); } } --- 2143,2149 ---- else *s++ = NUL; #endif ! STRCPY(s, lbuf_line); } } *************** *** 2086,2091 **** --- 2180,2189 ---- else vim_free(mfp); } + #ifdef FEAT_MBYTE + /* Note: this makes the values in "tagp" invalid! */ + vim_free(conv_line); + #endif } else /* Out of memory! Just forget about the rest. */ { *************** *** 2123,2128 **** --- 2221,2238 ---- vim_free(incstack[incstack_idx].etag_fname); } #endif + #ifdef FEAT_MBYTE + if (pats == &convpat) + { + /* Go back from converted pattern to original pattern. */ + vim_free(pats->pat); + vim_free(pats->regmatch.regprog); + orgpat.regmatch.rm_ic = pats->regmatch.rm_ic; + pats = &orgpat; + } + if (vimconv.vc_type != CONV_NONE) + convert_setup(&vimconv, NULL, NULL); + #endif #ifdef FEAT_TAG_BINS if (sort_error) *************** *** 2154,2166 **** /* stop searching when already did a linear search, or when * TAG_NOIC used, and 'ignorecase' not set * or already did case-ignore search */ ! if (stop_searching || linear || (!p_ic && noic) || regmatch.rm_ic) break; # ifdef FEAT_CSCOPE if (use_cscope) break; # endif ! regmatch.rm_ic = TRUE; /* try another time while ignoring case */ } #endif --- 2264,2276 ---- /* stop searching when already did a linear search, or when * TAG_NOIC used, and 'ignorecase' not set * or already did case-ignore search */ ! if (stop_searching || linear || (!p_ic && noic) || pats->regmatch.rm_ic) break; # ifdef FEAT_CSCOPE if (use_cscope) break; # endif ! pats->regmatch.rm_ic = TRUE; /* try another time while ignoring case */ } #endif *************** *** 2173,2179 **** findtag_end: vim_free(lbuf); ! 
vim_free(regmatch.regprog); vim_free(tag_fname); #ifdef FEAT_EMACS_TAGS vim_free(ebuf); --- 2283,2289 ---- findtag_end: vim_free(lbuf); ! vim_free(pats->regmatch.regprog); vim_free(tag_fname); #ifdef FEAT_EMACS_TAGS vim_free(ebuf); *** ../vim-6.2.510/src/version.c Wed Apr 28 16:14:57 2004 --- src/version.c Wed Apr 28 16:17:12 2004 *************** *** 639,640 **** --- 639,642 ---- { /* Add new patch number below this line */ + /**/ + 511, /**/ -- The future isn't what it used to be. /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// Sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ Project leader for A-A-P -- http://www.A-A-P.org /// \\\ Buy at Amazon and help AIDS victims -- http://ICCF.nl/click1.html ///