commit b775423b1defb184ac2dad068362f42a2aa7d239
Author: Klesh Wong
Date:   Mon Jul 12 18:30:35 2021 +0800

    [init] initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1d17dae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.venv
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5888e7f
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,17 @@
+FROM alpine:latest
+
+LABEL maintainer="Klesh Wong"
+LABEL description="Simple verification-code solver"
+
+WORKDIR /data
+
+RUN sed -i 's|dl-cdn.alpinelinux.org|mirrors.aliyun.com|g' /etc/apk/repositories
+RUN apk update --no-cache \
+    && apk add --update --no-cache tesseract-ocr py3-pip py3-numpy py3-pillow \
+    && rm -rf /var/cache/apk/*
+
+COPY requirements.txt /data/requirements.txt
+RUN pip3 install -r /data/requirements.txt --index-url=https://mirrors.aliyun.com/pypi/simple
+
+COPY main.py /data/main.py
+CMD ["python3", "/data/main.py"]
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..9558c6d
--- /dev/null
+++ b/main.py
@@ -0,0 +1,249 @@
+import pytesseract
+import base64
+import hmac
+import numpy as np
+from flask import Flask, request
+from flasgger import Swagger
+from io import BytesIO
+from PIL import Image
+from tempfile import NamedTemporaryFile
+from typing import Tuple
+from os import environ
+
+
+app = Flask(__name__)
+app.config['SWAGGER'] = {
+    'title': 'Simple verification-code solver',
+}
+
+Swagger(app)
+
+# Tesseract timeout in seconds used when the request does not supply one.
+DEFAULT_TIMEOUT = 300
+
+# Offsets of the eight neighbours of a pixel (8-connectivity).
+_NEIGHBOURS = (
+    (-1, -1), (-1, 0), (-1, +1),
+    (0, -1), (0, +1),
+    (+1, -1), (+1, 0), (+1, +1),
+)
+
+
+def remove_noise(img_array: np.ndarray, dominant_color: Tuple[int, int, int], threshold=10):
+    """
+    Return a copy of ``img_array`` with coloured pixels and small specks erased.
+
+    Pixels whose first three channels are not all equal (i.e. not grey) are
+    painted with ``dominant_color``. Afterwards every 8-connected group of
+    non-dominant pixels made of at most ``threshold`` pixels is painted with
+    ``dominant_color`` as well. ``img_array`` must be (height, width, channels)
+    and ``dominant_color`` must have one value per channel.
+    """
+    img_array = img_array.copy()
+    h, w = img_array.shape[:2]
+    dominant_color = np.asarray(dominant_color)
+
+    # Vectorised form of the per-pixel "is this pixel grey?" test.
+    colored = (img_array[..., 0] != img_array[..., 1]) | (img_array[..., 1] != img_array[..., 2])
+    img_array[colored] = dominant_color
+
+    foreground = np.any(img_array != dominant_color, axis=-1)
+    visited = np.zeros((h, w), dtype=bool)
+
+    for y, x in zip(*np.nonzero(foreground)):
+        if visited[y, x]:
+            continue
+        # Iterative flood fill: a recursive one exceeds Python's recursion
+        # limit as soon as a component has more than about a thousand pixels.
+        visited[y, x] = True
+        component = [(y, x)]
+        stack = [(y, x)]
+        while stack:
+            cy, cx = stack.pop()
+            for dy, dx in _NEIGHBOURS:
+                ny, nx = cy + dy, cx + dx
+                if ny < 0 or ny >= h or nx < 0 or nx >= w:
+                    continue
+                if visited[ny, nx] or not foreground[ny, nx]:
+                    continue
+                visited[ny, nx] = True
+                component.append((ny, nx))
+                stack.append((ny, nx))
+        if len(component) > threshold:
+            continue
+        for py, px in component:
+            img_array[py, px] = dominant_color
+
+    return img_array
+
+
+def sanitize(pil_img: Image.Image) -> np.ndarray:
+    """
+    Prepare a verification-code image for OCR.
+
+    The image is converted to RGB, its one-pixel border is cropped away and
+    noise is removed using the most frequent colour as the background.
+    Raises ValueError when nothing is left after cropping the border.
+    """
+    pil_img = pil_img.convert('RGB')
+    data = np.asarray(pil_img)
+    clip = data[1:-1, 1:-1]
+    if clip.size == 0:
+        raise ValueError('image too small')
+    # getcolors() returns None when the image has more than maxcolors colours
+    # (256 by default), so allow as many colours as there are pixels.
+    count_colors = pil_img.getcolors(maxcolors=pil_img.width * pil_img.height)
+    count_colors = sorted(count_colors, key=lambda cc: cc[0])
+    dominant_color = np.asarray(count_colors[-1][1])
+    return remove_noise(clip, dominant_color)
+
+
+def _is_authorized() -> bool:
+    """
+    Check the X-Key request header against the X_KEY environment variable.
+
+    Fails closed: when X_KEY is unset or empty every request is rejected.
+    A plain ``!=`` comparison would let a request without the header through
+    in that case, because both sides would be None.
+    """
+    expected = environ.get('X_KEY')
+    provided = request.headers.get('X-Key')
+    if not expected or provided is None:
+        return False
+    # Constant-time comparison so that timing does not leak the key.
+    return hmac.compare_digest(provided.encode('utf-8'), expected.encode('utf-8'))
+
+
+def _parse_timeout() -> int:
+    """
+    Read the optional ``timeout`` form field.
+
+    Returns DEFAULT_TIMEOUT when the field is missing, blank or not positive.
+    Raises ValueError when the field is not an integer.
+    """
+    raw = request.form.get('timeout', '').strip()
+    if not raw:
+        return DEFAULT_TIMEOUT
+    timeout = int(raw)
+    return timeout if timeout > 0 else DEFAULT_TIMEOUT
+
+
+def _recognize(image_bytes: bytes):
+    """
+    Run OCR on raw image bytes and return a Flask response value.
+
+    Returns the recognized text, or a (message, 400) tuple when the timeout
+    field or the image data is invalid.
+    """
+    try:
+        timeout = _parse_timeout()
+    except ValueError:
+        return "Invalid timeout", 400
+
+    try:
+        # Pillow reports unreadable image data as OSError subclasses.
+        ary = sanitize(Image.open(BytesIO(image_bytes)))
+    except (OSError, ValueError):
+        return "Invalid image", 400
+
+    return pytesseract.image_to_string(ary, timeout=timeout)
+
+
+@app.route('/verification-code', methods=['POST'])
+def solve_verification_code():
+    """
+    Solve verification code
+    ---
+    tags:
+      - Verification Code
+    parameters:
+      - in: formData
+        name: image
+        type: file
+        required: true
+        description: Image file
+      - in: formData
+        name: timeout
+        type: integer
+        required: false
+        description: Timeout
+      - in: header
+        name: X-Key
+        type: string
+        required: true
+    responses:
+      500:
+        description: Error message
+      400:
+        description: Invalid request
+      401:
+        description: Unauthorized
+      200:
+        description: text
+    """
+
+    if not _is_authorized():
+        return "Unauthorized", 401
+
+    if 'image' not in request.files:
+        return "No file uploaded", 400
+
+    image_file = request.files['image']
+
+    if image_file.filename == '':
+        return "No file name", 400
+
+    return _recognize(image_file.read())
+
+
+@app.route('/verification-code/base64', methods=['POST'])
+def sove_verification_code_base64():
+    """
+    Solve verification code from base64-encoded image
+    ---
+    tags:
+      - Verification Code
+    parameters:
+      - in: formData
+        name: image
+        type: string
+        required: true
+        description: Base64-encoded image
+      - in: formData
+        name: timeout
+        type: integer
+        required: false
+        description: Timeout
+      - in: header
+        name: X-Key
+        type: string
+        required: true
+    responses:
+      500:
+        description: Error message
+      400:
+        description: Invalid request
+      401:
+        description: Unauthorized
+      200:
+        description: text
+    """
+
+    if not _is_authorized():
+        return "Unauthorized", 401
+
+    image = request.form.get('image')
+    if not image:
+        return 'No image data', 400
+
+    try:
+        # binascii.Error, raised for malformed input, is a ValueError.
+        image_bytes = base64.b64decode(image)
+    except ValueError:
+        return 'Invalid base64 data', 400
+
+    return _recognize(image_bytes)
+
+
+if __name__ == "__main__":
+    app.run(debug=environ.get("FLASK_ENV") == 'development', host='0.0.0.0', port=8000)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8a637d6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+pillow
+pytesseract
+flask
+flasgger
+numpy
+waitress