几个有用的脚本备记

tesseract sh训练脚本

#! /bin/bash
#
# Build a new eng.traineddata from the stock Arial box/tiff pair.
#
# Creates ./tessenv under the current directory, downloads and unpacks the
# stock English box/tiff archive, runs the training tools on the Arial font
# data and combines the results in tessenv/build/eng.
#
# Commands used below (must be on PATH): GET, tar, tesseract,
# unicharset_extractor, shapeclustering, mftraining, cntraining,
# combine_tessdata.

# Stop at the first failing step: every later step works on files produced by
# the earlier ones, so continuing after a failure only moves/copies the wrong
# (or missing) files.
set -euo pipefail

# build the environment
# -p keeps a re-run from failing on directories that already exist.
mkdir -p tessenv
cd tessenv
TROOT=$(pwd)
mkdir -p "$TROOT/stockfonts" "$TROOT/build/eng"
echo "Environment built"

# Get the stock english fonts from Google (old, but they work)
# NOTE(review): GET is presumably the lwp-request alias from libwww-perl, and
# the googlecode.com host may no longer serve this file -- confirm both.
ARCHIVE=boxtiff-2.01.eng.tar.gz
cd "$TROOT/stockfonts"
GET "http://tesseract-ocr.googlecode.com/files/$ARCHIVE" > "$ARCHIVE"
# A download that produced an empty file cannot be unpacked; say so here
# instead of letting tar report a confusing error.
if [[ ! -s "$ARCHIVE" ]]; then
  echo "download of $ARCHIVE produced an empty file" >&2
  exit 1
fi
echo "Google box/tiff tar.gz loaded"

# unpack the fonts, a new english (eng) directory is created with tif/box files
tar -xzf "$ARCHIVE"
echo "box/tiff file unpacked"

# Move the arial font data into the build space (yes, the exp0 is required)
mv "$TROOT/stockfonts/eng/eng.arial.g4.tif" "$TROOT/build/eng.arial.exp0.tif"
mv "$TROOT/stockfonts/eng/eng.arial.box" "$TROOT/build/eng.arial.exp0.box"
echo "ariel box/tif moved and renamed"
cd "$TROOT/build"

# Create the font_properties file
echo "arial 0 0 0 0 0" > font_properties

# BEGIN BUILDING NEW eng.traineddata
tesseract eng.arial.exp0.tif eng.arial.exp0 nobatch box.train
unicharset_extractor eng.arial.exp0.box
shapeclustering -F font_properties -U unicharset eng.arial.exp0.tr
mftraining -F font_properties -U unicharset -O eng.unicharset eng.arial.exp0.tr
cntraining eng.arial.exp0.tr
echo "eng.traineddata complete"

# BEGIN combining into an eng.traineddata set
# Note files are moved into an isolated directory for combining
# Note files have language prefix added

cp eng.unicharset "$TROOT/build/eng/eng.unicharset"
cp normproto "$TROOT/build/eng/eng.normproto"
cp inttemp "$TROOT/build/eng/eng.inttemp"
cp pffmtable "$TROOT/build/eng/eng.pffmtable"
cp shapetable "$TROOT/build/eng/eng.shapetable"

cd "$TROOT/build/eng"
combine_tessdata eng.

# You now have an eng.traineddata file in your $TROOT/build/eng directory
# You must move this file to your /usr/local/share/tessdata directory.
# You will need sudo permission.
# BE SURE to back up your old eng.traineddata FIRST
# Recommend testing your new tesseract with the eng.arial.exp0.tif file in
# the build directory.

opencv 文本图片预处理

# -*- coding: UTF-8 -*-
import cv2
def digitsimg(src):
    """Preprocess an image of digits before recognition.

    Pipeline: grayscale -> Otsu binarization -> bicubic resize to 128x128 ->
    erode -> dilate -> histogram equalization -> median blur (aperture 5).

    Args:
        src: Image passed to cv2.cvtColor with COLOR_BGR2GRAY
            (presumably a BGR image as loaded by cv2.imread -- confirm
            against callers).

    Returns:
        The processed 128x128 single-channel image.
    """
    # Convert to grayscale.
    img_gray = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)

    # Binarize with Otsu's method; the computed threshold value is not needed.
    _, result = cv2.threshold(
        img_gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 2))
    # NOTE(review): the original eroded the binarized image at this point but
    # never used the result (the resize below read the un-eroded image), so
    # that dead computation was dropped; the output of this stage is unchanged.

    # Enlarge to a fixed size to make recognition easier.
    result = cv2.resize(result, (128, 128), interpolation=cv2.INTER_CUBIC)

    # Erode to remove small specks left after enlarging.
    eroded = cv2.erode(result, kernel)

    # Dilate so the digit strokes become fuller again.
    result = cv2.dilate(eroded, kernel)

    # Histogram equalization. cv2.equalizeHist returns the equalized image
    # rather than modifying its argument, so the result must be assigned;
    # the original discarded it, which made this step a no-op.
    result = cv2.equalizeHist(result)

    # Median filter to suppress remaining noise.
    result = cv2.medianBlur(result, 5)
    return result
# Disabled code: the whole `chineseimg` definition below sits inside a bare
# triple-quoted string, so it is never defined or executed at runtime.
# NOTE(review): if this is ever re-enabled as written, two things need
# attention first:
#   * the `return result` right after the first equalizeHist call makes every
#     statement after it (resize, erode, dilate, median blur) unreachable;
#   * both cv2.equalizeHist(result) calls discard their return value.
'''
def chineseimg(src):
    
    #灰度化
    img_gray = cv2.cvtColor(src,cv2.COLOR_BGR2GRAY)
    #Otsu thresholding 二值化
    ret,result= cv2.threshold(img_gray,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
  #  cv2.imshow('otsu',result)
  #  cv2.waitKey(0)
    #直方图均衡化使图像更清晰
    cv2.equalizeHist(result)
  #  cv2.imshow('直方图',result)
 #   cv2.waitKey(0)
    return result
    #将结果放大便于识别
    result = cv2.resize(result,(256,128),interpolation=cv2.INTER_CUBIC)
    #腐蚀去除放大后的一些小的点
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,2))
    eroded = cv2.erode(result,kernel)
    cv2.imshow('eroded',eroded)
    cv2.waitKey(0)
    #膨胀使数字更饱满
    result = cv2.dilate(eroded,kernel)
    cv2.imshow('dilated',result)
    cv2.waitKey(0)
    #直方图均衡化使图像更清晰
    cv2.equalizeHist(result)
    #中值滤波去除噪点
    result = cv2.medianBlur(result,5)
    cv2.imshow('median',result)
    cv2.waitKey(0)'''
    

https://coding.net/u/mengning/p/np2016/git/blob/master/BloodTestReportOCR/imgproc.py