minte9
LearnRemember



Image to text

Open an image and extract the text it contains.
 
"""Image to text
OpenCV (Open Source Computer Vision Library) is used to load images.
Tesseract is used for optical character recognition.
"""
import os
# Absolute directory containing this script (symlinks resolved);
# the image path below is built relative to it.
DIR = os.path.dirname(os.path.realpath(__file__))

import cv2, pytesseract
img_path = DIR + '/files/01.png'
img = cv2.imread(img_path)
if img is None:
    # cv2.imread() does not raise when the file is missing or cannot be
    # decoded; it returns None, which would otherwise only surface later
    # as an obscure error inside pytesseract.
    raise FileNotFoundError('Cannot read image: %s' % img_path)

# Run Tesseract OCR on the loaded image and get the recognised text.
text = pytesseract.image_to_string(img)

print(text)

"""
I've also done a lot of testing since LiveJournal.
Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life. I get comments about blog
posts that are almost 10 years old. “Hey, I found
this code. I found a bug,” and I'm suddenly
maintaining code.
"""

Highlighted

Converting to HSV makes color selection easier.
 
"""Image to text
Get only highlighted text
"""
import os
import cv2, pytesseract
import numpy as np
# Absolute directory containing this script (symlinks resolved);
# the image path below is built relative to it.
DIR = os.path.dirname(os.path.realpath(__file__))

def imread_highlighted(img, lower=(22, 93, 0), upper=(45, 255, 255),
                       *, debug=False):
    """Return an inverted mask of the highlighted (yellow) area of an image.

    The image is converted from BGR to HSV, because a colour range is
    easier to express in HSV. Pixels whose HSV value lies between `lower`
    and `upper` are selected with cv2.inRange() and the mask is inverted.

    Args:
        img: BGR image, e.g. as returned by cv2.imread().
        lower: Lower HSV bound of the colour to select. The default is the
            lower end of the yellow range.
        upper: Upper HSV bound of the colour to select. The default is the
            upper end of the yellow range.
        debug: When true, show the masked original and the returned image
            in windows and wait for a key press.

    Returns:
        The inverted mask: pixels inside the HSV range are 0, all other
        pixels are 255.

    Raises:
        ValueError: If `img` is None (cv2.imread() returns None when the
            file cannot be read).
    """
    if img is None:
        raise ValueError('img is None (image could not be read)')

    # Convert BGR to HSV
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # Mask to get only the colours inside the requested HSV range
    mask = cv2.inRange(hsv, np.array(lower), np.array(upper))

    # Invert the mask; this is the image handed to the OCR step
    inverted = cv2.bitwise_not(mask)

    # Display images (the masked original is only needed for this preview,
    # so it is not computed on the normal path)
    if debug:
        masked = cv2.bitwise_and(img, img, mask=mask)
        cv2.imshow("img", masked)
        cv2.imshow("img2", inverted)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return inverted


img_path = DIR + '/files/01.png'
img = cv2.imread(img_path)
if img is None:
    # cv2.imread() returns None instead of raising when the file is
    # missing or cannot be decoded.
    raise FileNotFoundError('Cannot read image: %s' % img_path)
img2 = imread_highlighted(img)

# OCR the full image and the image that keeps only the highlighted area.
text = pytesseract.image_to_string(img).strip()
highlighted = pytesseract.image_to_string(img2).strip()

# Guard the empty case: str.replace('') matches between every character,
# so an empty OCR result would scatter '<i></i>' through the whole text.
if highlighted:
    replaced = text.replace(highlighted, '<i>%s</i>' % highlighted)
else:
    replaced = text

print('Text: \n' + text, '\n')
print('Highlighted: \n' + highlighted, '\n')
print('Replaced: \n' + replaced, '\n')

"""
Text: 
I've also done a lot of testing since LiveJournal.
Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life. I get comments about blog
posts that are almost 10 years old. “Hey, I found
this code. I found a bug,” and I'm suddenly
maintaining code. 

Highlighted: 
Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life. 

Replaced: 
I've also done a lot of testing since LiveJournal.
<i>Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life.</i> I get comments about blog
posts that are almost 10 years old. “Hey, I found
this code. I found a bug,” and I'm suddenly
maintaining code. 
"""

Corrections

Some characters might not be correctly detected (more work to do).
 
"""Image to text - highlight mask
Some characters might not be detected correctly.
We can reduce noise, remove newlines, use string slices,
check punctuation, etc.
"""
import os, sys
import cv2, pytesseract, numpy as np
import re
# Absolute directory containing this script (symlinks resolved);
# the 'files' directory walked below is resolved relative to it.
DIR = os.path.dirname(os.path.realpath(__file__))

def imread_highlighted(img, lower=(22, 93, 0), upper=(45, 255, 255),
                       *, debug=False):
    """Return a blurred, inverted mask of the highlighted area of an image.

    The image is converted from BGR to HSV, pixels whose HSV value lies
    between `lower` and `upper` are selected with cv2.inRange(), the mask
    is inverted and then smoothed with a 3x3 Gaussian blur to reduce noise.

    Args:
        img: BGR image, e.g. as returned by cv2.imread().
        lower: Lower HSV bound of the colour to select.
        upper: Upper HSV bound of the colour to select.
        debug: When true, show the masked original and the returned image
            in windows and wait for a key press.

    Returns:
        The inverted mask after Gaussian blurring.

    Raises:
        ValueError: If `img` is None (cv2.imread() returns None when the
            file cannot be read).
    """
    if img is None:
        raise ValueError('img is None (image could not be read)')

    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, np.array(lower), np.array(upper))

    inverted = cv2.bitwise_not(mask)
    inverted = cv2.GaussianBlur(inverted, (3, 3), 0)  # reduce noise

    # The masked original is only needed for the preview, so it is not
    # computed on the normal path.
    if debug:
        masked = cv2.bitwise_and(img, img, mask=mask)
        cv2.imshow("img", masked)
        cv2.imshow("img2", inverted)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return inverted

def highlighted_replaced(img, img2, *, debug=False):
    """Wrap the highlighted passage of the OCR text in ``<i>...</i>``.

    Both images are run through Tesseract. The two OCR results rarely match
    character for character, so only the first and the last 10 characters
    of the highlighted text are used as anchors to locate the passage in
    the full text.

    Args:
        img: Image containing the full text.
        img2: Image in which only the highlighted area is kept.
        debug: When true, print the intermediate strings.

    Returns:
        The OCR text of `img` with ``<i>`` inserted before the first
        occurrence of the start anchor and ``</i>`` after the first
        occurrence of the end anchor that follows it. The text is returned
        unmarked when no highlighted text was recognised or when an anchor
        cannot be found.
    """
    text = pytesseract.image_to_string(img).strip()
    highlighted = pytesseract.image_to_string(img2).strip()

    # Remove double new lines
    text = text.replace('\n\n', '\n')
    highlighted = highlighted.replace('\n\n', '\n')

    # A comma wrongly detected at the very end of the passage is dropped
    highlighted = re.sub(r',$', '', highlighted)

    start = highlighted[:10].strip()  # start of highlighted text
    end = highlighted[-10:].strip()   # end of highlighted text

    replaced = text
    # An empty anchor must never reach str.replace()/find(): '' matches at
    # every position and would tag the whole text.
    if start and end:
        start_at = text.find(start)
        end_at = text.find(end, start_at) if start_at != -1 else -1

        if start_at == -1:
            print('AssertionError:\n', '<%s> not in text' % start)
        elif end_at == -1:
            print('AssertionError:\n', '<%s> not in text' % end)
        else:
            # Mark exactly one span, so anchors that also occur elsewhere
            # in the text do not produce extra or unbalanced tags.
            end_stop = end_at + len(end)
            replaced = (
                text[:start_at]
                + '<i>' + text[start_at:end_stop] + '</i>'
                + text[end_stop:]
            )

    if debug:
        print('Text: \n' + text, '\n')
        print('Highlighted: \n' + highlighted, '\n')
        print('Start: \n' + start, '\n')
        print('End: \n' + end, '\n')
        print('Replaced: \n' + replaced, '\n')

    return replaced

# Run the highlight extraction on every file below the 'files' directory.
for root, _dirs, files in os.walk(DIR + '/files/'):
    for file in files:
        # os.walk() descends into sub-directories, so the path has to be
        # built from `root`, not from the top-level directory.
        path = os.path.join(root, file)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread() returns None for files it cannot decode.
            print('Skipped (not a readable image): ' + path, '\n')
            continue
        img2 = imread_highlighted(img)
        replaced = highlighted_replaced(img, img2)
        print(replaced, '\n')

"""
where it was writing some big file. <i>We took really
good advantage of multithreading in Java, which
was less painful than I had expected it to be. It was
just really pleasant to work on.</i> From the API we
had designed we saw all these directions it could
grow. 

I've also done a lot of testing since LiveJournal.
<i>Once I started working with other people
especially. And once I realized that code I write
never fucking goes away and I'm going to be a
maintainer for life.</i> I get comments about blog
posts that are almost 10 years old. “Hey, I found
this code. I found a bug,” and I'm suddenly
maintaining code. 

Your competitor's six-month 1.0 has crap code and
<i>they're going to have to rewrite it in two years but</i>,
guess what: they can rewrite it because you don't
have ajob anymore. 
"""



  Last update: 222 days ago