Commit bf6e7d8e authored by Ilya
Browse files

add simple docx parser

parent 15a2f88f
<w:pPr><w:pBdr><w:top w:val="single" w:sz="2" w:space="1" w:color="000000"/><w:left w:val="single" w:sz="2" w:space="1" w:color="000000"/><w:bottom w:val="single" w:sz="2" w:space="1" w:color="000000"/><w:right w:val="single" w:sz="2" w:space="1" w:color="000000"/></w:pBdr>
\ No newline at end of file
import json
import os
import random
from collections import namedtuple
from typing import Tuple, List
from zipfile import ZipFile
from shutil import copyfile
import cv2
import matplotlib.pyplot as plt
import numpy as np
from pdf2image import convert_from_bytes
# Axis-aligned rectangle: top-left corner (x, y) plus width and height.
Board = namedtuple("Board", ['x', 'y', 'w', 'h'])


def _read_sibling_text(file_name: str) -> str:
    """Return the text content of ``file_name`` located next to this module."""
    # The context manager closes the handle right after reading instead of
    # leaving it open until garbage collection.
    with open(os.path.join(os.path.dirname(__file__), file_name)) as handle:
        return handle.read()


# XML snippets substituted for every "<w:pPr>" tag by modify_one_docx; they
# are loaded once at import time from files shipped beside this module.
black = _read_sibling_text("black.xml")
red = _read_sibling_text("red.xml")
def random_name():
    """Return 12 distinct lowercase ASCII letters, randomly chosen and ordered."""
    alphabet = "qwertyuiopasdfghjklzxcvbnm"
    # random.sample draws without replacement, so no letter repeats.
    picked = random.sample(alphabet, 12)
    return "".join(picked)
def modify_one_docx(path_in: str, path_out: str, repl: str):
    """Write a patched copy of a .docx and convert that copy to PDF.

    Every ``<w:pPr>`` occurrence in ``word/document.xml`` is replaced by
    ``repl``; all other archive members are copied unchanged. The patched
    document is written to ``path_out``, converted with ``lowriter`` and then
    deleted, so only the PDF remains.

    Args:
        path_in: Path of the source .docx archive.
        path_out: Path where the temporary patched .docx is written. The PDF
            is requested in the same directory; LibreOffice is expected to
            name it after ``path_out`` with a ``.pdf`` extension.
        repl: Text substituted for each ``<w:pPr>`` tag.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
        FileNotFoundError: If ``path_in`` or the ``lowriter`` executable
            cannot be found.
    """
    # Local import keeps this block self-contained; the module's import
    # section does not provide subprocess.
    import subprocess

    document_entry = "word/document.xml"
    with ZipFile(path_in) as archive_in:
        # Fail early (KeyError) when the main document part is missing,
        # before any output file is created.
        archive_in.getinfo(document_entry)
        # Rebuild the archive member by member. Appending a second
        # "word/document.xml" to a copy would leave two entries with the
        # same name inside the zip.
        with ZipFile(path_out, "w") as archive_out:
            for item in archive_in.infolist():
                payload = archive_in.read(item.filename)
                if item.filename == document_entry:
                    # Decode/encode explicitly as UTF-8 so the result does
                    # not depend on the platform's default text encoding.
                    patched = payload.decode("utf-8").replace("<w:pPr>", repl)
                    payload = patched.encode("utf-8")
                # Passing the ZipInfo keeps the member's original
                # compression method.
                archive_out.writestr(item, payload)

    out_dir = os.path.dirname(os.path.abspath(path_out))
    try:
        # An argument list (no shell) keeps spaces or shell metacharacters
        # in path_out from being interpreted. The exit status is ignored,
        # matching the previous os.system call.
        subprocess.run(
            ["lowriter", "--headless", "--convert-to", "pdf",
             "--outdir", out_dir, path_out],
            check=False,
        )
    finally:
        # The patched .docx is only an intermediate artifact.
        os.remove(path_out)
def modify(file_in: str, file_out: str):
    """Produce the "red" and "black" PDF variants of a .docx file.

    Args:
        file_in: Path of the source .docx.
        file_out: Template for the intermediate .docx paths; it must contain
            one ``{}`` placeholder, which is filled with ``"red"`` and
            ``"black"``.

    Returns:
        A ``(red_pdf_path, black_pdf_path)`` tuple. Each path is the
        corresponding intermediate .docx path with its extension swapped for
        ``.pdf``.
    """
    pdf_paths = []
    # Red is processed before black, as before.
    for color, border_xml in (("red", red), ("black", black)):
        docx_path = file_out.format(color)
        modify_one_docx(file_in, docx_path, border_xml)
        # Swap only the final extension. str.replace(".docx", ".pdf") would
        # also rewrite any directory component that contains ".docx".
        pdf_paths.append(os.path.splitext(docx_path)[0] + ".pdf")
    return tuple(pdf_paths)
class ImageWithBoard:
    """An image together with the bounding boxes (``Board``) found on it."""

    def __init__(self, image: np.ndarray, boards: List[Board]):
        """Store the image array and its list of boards without copying."""
        self.image = image
        self.boards = boards

    @staticmethod
    def _png_bytes(pixels: np.ndarray) -> bytes:
        """Encode ``pixels`` as PNG in memory using ``plt.imsave``."""
        import io  # local import: not part of the module's import section

        buffer = io.BytesIO()
        # A file-like target needs the format spelled out, since there is no
        # file extension to infer it from.
        plt.imsave(buffer, pixels, format="png")
        return buffer.getvalue()

    def save(self, path: str):
        """Write the image, its boards and per-board previews to a zip file.

        The archive at ``path`` contains:
          * ``image.png``     - the stored image;
          * ``boards.json``   - a list of ``{"x", "y", "w", "h"}`` objects;
          * ``image_<i>.png`` - a copy of the image with board ``i`` outlined
            in colour ``(0, 255, 0)``, 2 px thick.

        PNGs are encoded in memory, so no temporary files are created, and
        the archive is closed even if encoding fails part-way.
        """
        with ZipFile(path, "w") as archive:
            archive.writestr("image.png", self._png_bytes(self.image))

            boxes = [dict(x=b.x, y=b.y, w=b.w, h=b.h) for b in self.boards]
            archive.writestr(
                "boards.json",
                json.dumps(boxes, ensure_ascii=False).encode(),
            )

            for i, board in enumerate(self.boards):
                x, y, w, h = board.x, board.y, board.w, board.h
                # Draw on a copy so the stored image stays unmodified.
                outlined = cv2.rectangle(
                    np.copy(self.image), (x, y), (x + w, y + h), (0, 255, 0), 2
                )
                archive.writestr(
                    "image_{}.png".format(i), self._png_bytes(outlined)
                )
def filter_contour(contours, hierarchy):
    """Keep only the contours whose hierarchy row ends with 0.

    Args:
        contours: Sequence of contours, index-aligned with ``hierarchy[0]``.
        hierarchy: Hierarchy structure as returned by ``cv2.findContours``
            (indexed as ``hierarchy[0][i]``), or ``None`` when no contours
            were found. In OpenCV's layout the last entry of a row is the
            index of the parent contour, so this selects the direct children
            of contour 0.

    Returns:
        A list with the selected contours, in their original order. Empty
        when ``hierarchy`` is ``None``.
    """
    # cv2.findContours yields hierarchy=None for an image with no contours;
    # indexing it would raise TypeError.
    if hierarchy is None:
        return []
    return [c for c, h in zip(contours, hierarchy[0]) if h[-1] == 0]
def get_contour_image(image_black: np.ndarray, image_red: np.ndarray) -> ImageWithBoard:
    """Locate the regions where two renderings of a page differ.

    Pixels whose channel-wise difference (``image_red - image_black``) sums
    to more than zero are treated as border pixels. They are painted black on
    an otherwise white canvas for contour detection, and painted white in a
    copy of ``image_black``.

    Args:
        image_black: Rendering used as the base of the returned image.
        image_red: Rendering compared against ``image_black``; presumably the
            same page with differently coloured borders (TODO confirm).

    Returns:
        An ``ImageWithBoard`` holding the cleaned copy of ``image_black`` and
        one ``Board`` (bounding rectangle) per contour kept by
        ``filter_contour``.

    Side effects:
        Saves the border canvas to ``/tmp/board_<random>.png``.
    """
    difference = image_red - image_black
    changed = difference.sum(axis=2) > 0

    # White canvas with the differing pixels in black, reduced to one channel
    # because findContours works on single-channel input.
    border_canvas = np.zeros(difference.shape, dtype=image_black.dtype) + 255
    border_canvas[changed] = 0
    border_canvas = cv2.cvtColor(border_canvas, cv2.COLOR_BGR2GRAY)

    # Blank out the differing pixels in a copy of the base image.
    cleaned = np.copy(image_black)
    cleaned[changed] = 255

    found, tree = cv2.findContours(border_canvas, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    kept = filter_contour(found, tree)

    plt.imsave("/tmp/board_{}.png".format(random_name()), border_canvas)

    boards_list = [Board(*cv2.boundingRect(outline)) for outline in kept]
    return ImageWithBoard(cleaned, boards_list)
def get_images(path: str) -> List[ImageWithBoard]:
    """Render a .docx to page images and detect the bordered regions on each.

    The document is converted twice via ``modify`` (red and black variants),
    both PDFs are rasterised with ``convert_from_bytes``, and the pages are
    compared pairwise with ``get_contour_image``.

    Args:
        path: Path of the source .docx file.

    Returns:
        One ``ImageWithBoard`` per page pair. ``zip`` stops at the shorter of
        the two page lists.
    """
    path_red, path_black = modify(path, "/tmp/ex_docx_{}.docx")
    try:
        # Context managers close the PDF handles right after reading.
        with open(path_red, 'rb') as pdf_red:
            pages_red = convert_from_bytes(pdf_red.read())
        with open(path_black, 'rb') as pdf_black:
            pages_black = convert_from_bytes(pdf_black.read())

        res = []
        for page_red, page_black in zip(pages_red, pages_black):
            res.append(get_contour_image(np.asarray(page_black), np.asarray(page_red)))
        return res
    finally:
        # Remove the intermediate PDFs even when rasterising or contour
        # detection fails. A PDF that was never produced is skipped so the
        # original error is not masked by the cleanup.
        for pdf_path in (path_red, path_black):
            try:
                os.remove(pdf_path)
            except FileNotFoundError:
                pass
if __name__ == '__main__':
    # Manual run: process one hard-coded document and write one zip archive
    # per resulting page image into /tmp.
    source_docx = "/home/padre/rojects/docx-parser/data/ОписаниеПропуск.docx"
    for page_index, page in enumerate(get_images(source_docx)):
        page.save("/tmp/image_{}.zip".format(page_index))
<w:pPr><w:pBdr><w:top w:val="single" w:sz="2" w:space="1" w:color="ED1C24"/><w:left w:val="single" w:sz="2" w:space="1" w:color="ED1C24"/><w:bottom w:val="single" w:sz="2" w:space="1" w:color="ED1C24"/><w:right w:val="single" w:sz="2" w:space="1" w:color="ED1C24"/></w:pBdr>
\ No newline at end of file
import cv2
import numpy as np
import unittest
import matplotlib.pyplot as plt
from modify_docx import get_contour_image
# Creates a 20x20 inch matplotlib figure when the test module is imported.
# NOTE(review): the test below only calls plt.imsave, which presumably does
# not draw on the current figure - confirm whether this is still needed.
plt.figure(figsize=(20, 20))
class TestImages(unittest.TestCase):
    """Checks contour detection on the fixture images in ``data/``."""

    def test_contours(self):
        """Detect boards on the red/black fixture pair and expect ten.

        Intermediate images are also written to ``/tmp`` for visual
        inspection.
        """
        # cv2.imread returns BGR; reversing the last axis gives RGB.
        image_red = cv2.imread("data/img_red.png")[:, :, ::-1]
        plt.imsave("/tmp/image_{}.png".format("image_red"), image_red)
        image_black = cv2.imread("data/img_black.png")[:, :, ::-1]
        # Save the black rendering under its own name. This line used to
        # write image_red a second time, so the black input was never dumped.
        plt.imsave("/tmp/image_{}.png".format("image_black"), image_black)
        image_with_contours = get_contour_image(image_black, image_red)
        plt.imsave("/tmp/image_{}.png".format("image_with_contours"), image_with_contours.image)
        plt.imsave("/tmp/image_.jpg", image_with_contours.image)
        for i, contour in enumerate(image_with_contours.boards):
            x, y, w, h = contour.x, contour.y, contour.w, contour.h
            # Draw each board on a fresh copy so the previews do not
            # accumulate rectangles.
            image = np.copy(image_with_contours.image)
            plt.imsave("/tmp/image_{}.png".format(i),
                       cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2))
        self.assertEqual(len(image_with_contours.boards), 10)
......@@ -6,3 +6,5 @@ scikit-image
jupyter
pandas
lxml
opencv-python==4.1.2.30
pdf2image==1.10.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment