Tesseract 3.01
/data/source/tesseract-ocr/cube/cube_utils.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        cube_utils.h
00003  * Description: Declaration of the Cube Utilities Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  *(C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0(the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // The CubeUtils class provides miscellaneous utility and helper functions
00021 // to the rest of the Cube Engine
00022 
00023 #ifndef CUBE_UTILS_H
00024 #define CUBE_UTILS_H
00025 
00026 #include <vector>
00027 #include <string>
00028 
00029 #include "allheaders.h"
00030 #include "const.h"
00031 #include "char_set.h"
00032 #include "char_samp.h"
00033 #include "img.h"
00034 
00035 namespace tesseract {
00036 class CubeUtils {
        // Every member function other than the constructor and destructor is
        // declared static, so the helpers can be called without an instance.
00037  public:
00038   CubeUtils();
00039   ~CubeUtils();
00040 
00041   // Converts a probability value to a cost by taking the -log() of the
00042   // probability value to a known base.
00043   static int Prob2Cost(double prob_val);
00044   // Converts a cost to a probability by computing exp(-normalized cost).
00045   static double Cost2Prob(int cost);
00046   // Computes the length of a 32-bit char buffer.
00047   static int StrLen(const char_32 *str);
00048   // Compares two 32-bit char buffers.
00049   static int StrCmp(const char_32 *str1, const char_32 *str2);
00050   // Duplicates a 32-bit char buffer.
        // NOTE(review): ownership of the returned buffer is not stated here;
        // presumably the caller must free it - confirm in the implementation.
00051   static char_32 *StrDup(const char_32 *str);
00052   // Creates a CharSamp from an IMAGE and a bounding box
        // (left, top, wid, hgt).
00053   static CharSamp *CharSampleFromImg(IMAGE *img,
00054                                      int left, int top, int wid, int hgt);
00055   // Creates a CharSamp from a Pix and a bounding box
        // (left, top, wid, hgt).
00056   static CharSamp *CharSampleFromPix(Pix *pix,
00057                                      int left, int top, int wid, int hgt);
00058   // Creates an IMAGE from a CharSamp.
00059   static IMAGE *ImageFromCharSample(CharSamp *char_samp);
00060   // Creates a Pix from a CharSamp.
00061   static Pix *PixFromCharSample(CharSamp *char_samp);
00062   // Reads the contents of a file into the string pointed to by str.
        // NOTE(review): the bool result presumably reports success - confirm
        // in the implementation.
00063   static bool ReadFileToString(const string &file_name, string *str);
00064   // Splits a string using any of the specified delimiters; the pieces
        // are returned through str_vec.
00065   static void SplitStringUsing(const string &str, const string &delims,
00066                                vector<string> *str_vec);
00067   // UTF-8 to UTF-32 (and UTF-32 to UTF-8) conversion functions. The
        // converted text is returned through the second argument.
00068   static void UTF8ToUTF32(const char *utf8_str, string_32 *str32);
00069   static void UTF32ToUTF8(const char_32 *utf32_str, string *str);
00070   // Returns true if the input word is either 1) all one case, or 2)
00071   // first character upper-case and remaining characters lower-case.
00072   // If char_set and unicharset are not NULL, uses tesseract's unicharset
00073   // functions to determine case properties. Otherwise, uses
00074   // C-locale-dependent functions, which may be unreliable on
00075   // non-ASCII characters.
00076   static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set,
00077                               UNICHARSET *unicharset);
00078   // Returns a char_32 pointer to the lower-case-transformed version of
00079   // the input string, or NULL on error. If char_set or unicharset is
00080   // NULL, or tesseract and cube do not share unicharsets, returns
00081   // NULL. The returned array must be freed by the caller.
00082   static char_32 *ToLower(const char_32 *str32, CharSet *char_set,
00083                           UNICHARSET *unicharset);
00084   // Returns a char_32 pointer to the upper-case-transformed version of
00085   // the input string, or NULL on error. If char_set or unicharset is
00086   // NULL, or tesseract and cube do not share unicharsets, returns
00087   // NULL. The returned array must be freed by the caller.
00088   static char_32 *ToUpper(const char_32 *str32, CharSet *char_set,
00089                           UNICHARSET *unicharset);
00090  private:
        // Overloaded helpers taking either an IMAGE or a Pix together with a
        // bounding box (left, top, wid, hgt).
        // NOTE(review): presumably these return a newly allocated buffer with
        // the pixel data of that region for the CharSampleFrom* functions;
        // contents and ownership are not visible here - confirm in the
        // implementation.
00091   static unsigned char *GetImageData(IMAGE *img,
00092                                      int left, int top, int wid, int hgt);
00093   static unsigned char *GetImageData(Pix *pix,
00094                                      int left, int top, int wid, int hgt);
00095 };
00096 }  // namespace tesseract
00097 #endif  // CUBE_UTILS_H
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines