Commit 21c71e34 authored by Oksana Belyaeva
Browse files

added Converter

parent 955339d7
......@@ -7,17 +7,8 @@ from PIL import Image
from object_detection.utils import dataset_util
from collections import namedtuple, OrderedDict
# Add more class labels as needed, make sure to start at 1
def class_text_to_int(row_label: str) -> "int | None":
    """Map a class label to its 1-based integer class id.

    Args:
        row_label: Class name taken from an annotation row.

    Returns:
        1 for 'text', 2 for 'table', 3 for 'figure'. For any other label
        "unknown type" is printed and None is returned.
    """
    # Ids start at 1; add more class labels here as needed.
    label_ids = {'text': 1, 'table': 2, 'figure': 3}
    class_id = label_ids.get(row_label)
    if class_id is None:
        print("unknown type")
    # Explicit return: the original ended in a bare `None` expression, which
    # only returned None by falling off the end of the function.
    return class_id
from utils_convert_draw_format.ConverterCLAW import ConverterCLAW
def split(df, group):
data = namedtuple('data', ['filename', 'object'])
......@@ -49,7 +40,7 @@ def create_tf_example(group, path):
ymins.append(max(0.0, row['ymin'] / height))
ymaxs.append(min(1.0, row['ymax'] / height))
classes_text.append('text'.encode('utf8') if row['class'] == 'list' else row['class'].encode('utf8')) # TODO for 4 class
classes.append(class_text_to_int(row['class']))
classes.append(ConverterCLAW.class_text_to_int(row['class']))
tf_example = tf.train.Example(features=tf.train.Features(feature={
'image/height': dataset_util.int64_feature(height),
......
......@@ -7,22 +7,8 @@ from PIL import Image
from object_detection.utils import dataset_util
from collections import namedtuple, OrderedDict
# Add more class labels as needed, make sure to start at 1
def class_text_to_int(row_label: str) -> "int | None":
    """Map a class label to its 1-based integer class id.

    Args:
        row_label: Class name taken from an annotation row.

    Returns:
        1 for 'text' and 'list', 2 for 'title', 3 for 'table', 4 for
        'figure'; None for any other label.
    """
    label_ids = {
        'text': 1,
        'list': 1,  # TODO for 4 class: 'list' is folded into 'text'
        'title': 2,
        'table': 3,
        'figure': 4,
    }
    # Explicit lookup result: the original ended in a bare `None` expression,
    # which only returned None by falling off the end of the function.
    return label_ids.get(row_label)
from utils_convert_draw_format.ConverterPubLayNet import ConverterPubLayNet
def split(df, group):
data = namedtuple('data', ['filename', 'object'])
......@@ -52,7 +38,7 @@ def create_tf_example(group, path):
ymins.append(max(0.0, row['ymin'] / height))
ymaxs.append(min(1.0, row['ymax'] / height))
classes_text.append('text'.encode('utf8') if row['class'] == 'list' else row['class'].encode('utf8')) # TODO for 4 class
classes.append(class_text_to_int(row['class']))
classes.append(ConverterPubLayNet.class_text_to_int(row['class']))
tf_example = tf.train.Example(features=tf.train.Features(feature={
'image/height': dataset_util.int64_feature(height),
......
import datetime
from utils_convert_draw_format.Converter import ConverterAbstract
import os
class ConverterCLAW(ConverterAbstract):
    """Dataset settings and COCO metadata for the CLAW dataset.

    Exposes the raw/working data directories, the name of the dataset split
    being processed, the label-to-id mapping, and the ``info`` / ``licenses``
    / ``categories`` sections written into the COCO json.
    """

    def __init__(self):
        # abspath: dirname(__file__) is '' when __file__ is a bare relative
        # file name, which would turn the working dir into "/../CLAWDATA".
        self.PROJ_DIR = os.path.dirname(os.path.abspath(__file__))
        self._RAW_DATA_DIR = "/home/ox/claw_data"
        self._WORK_DATA_DIR = os.path.join(self.PROJ_DIR, "..", "CLAWDATA")
        self._DATASETNAME = "train-0"

    @property
    def RAW_DATA_DIR(self):
        """Directory that holds the raw CLAW data."""
        return self._RAW_DATA_DIR

    @property
    def WORK_DATA_DIR(self):
        """Working directory, located next to this module's directory."""
        return self._WORK_DATA_DIR

    @property
    def DATASETNAME(self):
        """Name of the dataset split currently processed (default "train-0")."""
        return self._DATASETNAME

    @DATASETNAME.setter
    def DATASETNAME(self, name: str):
        self._DATASETNAME = name

    @staticmethod
    def class_text_to_int(row_label: str) -> "int | None":
        """Map a class label to its 1-based integer class id.

        Args:
            row_label: Class name taken from an annotation row.

        Returns:
            1 for 'text', 2 for 'table', 3 for 'figure'; None otherwise.
        """
        # Explicit lookup: the original ended in a bare `None` expression,
        # which only returned None by falling off the end of the function.
        return {'text': 1, 'table': 2, 'figure': 3}.get(row_label)

    @ConverterAbstract.CATEGORIES.getter
    def CATEGORIES(self):
        """COCO ``categories`` section; ids match ``class_text_to_int``."""
        return [
            {
                'id': 1,
                'name': 'text',
                'supercategory': 'shape',
            },
            {
                'id': 2,
                'name': 'table',
                'supercategory': 'shape',
            },
            {
                'id': 3,
                'name': 'figure',
                'supercategory': 'shape',
            },
        ]

    @ConverterAbstract.INFO.getter
    def INFO(self):
        """COCO ``info`` section; ``date_created`` is computed on each access."""
        return {
            "description": "CLAW Dataset",
            "url": "",
            "version": "0.1.0",
            "year": 2019,
            "contributor": "belyaeva",
            # NOTE(review): utcnow() is deprecated since Python 3.12; a
            # timezone-aware replacement would add a "+00:00" suffix here.
            "date_created": datetime.datetime.utcnow().isoformat(' ')
        }

    @ConverterAbstract.LICENSES.getter
    def LICENSES(self):
        """COCO ``licenses`` section."""
        return [
            {
                "id": 1,
                "name": "Attribution-NonCommercial-ShareAlike License",
                "url": "http://creativecommons.org/licenses/by-nc-sa/2.0/"
            }
        ]
\ No newline at end of file
import datetime
from utils_convert_draw_format.Converter import ConverterAbstract
import os
class ConverterPubLayNet(ConverterAbstract):
    """Dataset settings for the PubLayNet dataset.

    Only ``WORK_DATA_DIR`` and ``class_text_to_int`` are in active use; the
    remaining members are kept for interface compatibility and emit a
    ``DeprecationWarning`` when accessed.
    """

    def __init__(self):
        # abspath: dirname(__file__) is '' when __file__ is a bare relative
        # file name, which would turn the working dir into "/../PubLayNet".
        self.PROJ_DIR = os.path.dirname(os.path.abspath(__file__))
        # NOTE(review): same raw path as the CLAW converter - confirm that
        # this is intended for PubLayNet.
        self._RAW_DATA_DIR = "/home/ox/claw_data"
        self._WORK_DATA_DIR = os.path.join(self.PROJ_DIR, "..", "PubLayNet")
        self._DATASETNAME = "train-0"

    @staticmethod
    def _warn_deprecated(member: str) -> None:
        """Emit a DeprecationWarning naming the deprecated *member*."""
        # `@DeprecationWarning` cannot be used as a decorator: it replaces
        # the property with an exception instance and makes the later
        # `@DATASETNAME.setter` fail while the class body executes.
        import warnings

        warnings.warn(
            "ConverterPubLayNet.%s is deprecated" % member,
            DeprecationWarning,
            stacklevel=3,  # point at the code that touched the property
        )

    @property
    def RAW_DATA_DIR(self):
        """Deprecated: directory that holds the raw data."""
        self._warn_deprecated("RAW_DATA_DIR")
        return self._RAW_DATA_DIR

    @property
    def WORK_DATA_DIR(self):
        """Working directory, located next to this module's directory."""
        return self._WORK_DATA_DIR

    @property
    def DATASETNAME(self):
        """Deprecated: name of the dataset split (default "train-0")."""
        self._warn_deprecated("DATASETNAME")
        return self._DATASETNAME

    @DATASETNAME.setter
    def DATASETNAME(self, name: str):
        self._warn_deprecated("DATASETNAME")
        self._DATASETNAME = name

    # Add more class labels as needed, make sure to start at 1
    @staticmethod
    def class_text_to_int(row_label: str) -> "int | None":
        """Map a class label to its 1-based integer class id.

        Args:
            row_label: Class name taken from an annotation row.

        Returns:
            1 for 'text' and 'list', 2 for 'title', 3 for 'table', 4 for
            'figure'; None for any other label.
        """
        label_ids = {
            'text': 1,
            'list': 1,  # TODO for 4 class: 'list' is folded into 'text'
            'title': 2,
            'table': 3,
            'figure': 4,
        }
        # Explicit lookup: the original ended in a bare `None` expression,
        # which only returned None by falling off the end of the function.
        return label_ids.get(row_label)

    @ConverterAbstract.CATEGORIES.getter
    def CATEGORIES(self):
        """Deprecated: always an empty list."""
        self._warn_deprecated("CATEGORIES")
        return []

    @ConverterAbstract.INFO.getter
    def INFO(self):
        """Deprecated: always an empty dict."""
        self._warn_deprecated("INFO")
        return {}

    @ConverterAbstract.LICENSES.getter
    def LICENSES(self):
        """Deprecated: always an empty list."""
        self._warn_deprecated("LICENSES")
        return []
......@@ -3,7 +3,6 @@ import json
from PIL import Image
import pandas as pd
from config_project import CUR_DATA_DIR
import os
from typing import Dict
from utils_convert_draw_format.ConverterCLAW import ConverterCLAW
......
......@@ -11,54 +11,7 @@ from pycococreatortools import pycococreatortools
import pandas as pd
from collections import namedtuple
from config_project import CUR_DATA_DIR
# Name of the dataset split; used below for the image sub-directory, the
# annotation csv file name and the output json file name.
DATASETNAME = 'train-0'
# COCO "info" section; the description is derived from the last path
# component of CUR_DATA_DIR and the timestamp is taken at import time.
INFO = {
"description": CUR_DATA_DIR.split("/")[-1] + " Dataset",
"url": "",
"version": "0.1.0",
"year": 2019,
"contributor": "waspinator",
"date_created": datetime.datetime.utcnow().isoformat(' ')
}
# COCO "licenses" section.
LICENSES = [
{
"id": 1,
"name": "Attribution-NonCommercial-ShareAlike License",
"url": "http://creativecommons.org/licenses/by-nc-sa/2.0/"
}
]
# COCO "categories" section; ids start at 1.
CATEGORIES = [
{
'id': 1,
'name': 'text',
'supercategory': 'shape',
},
{
'id': 2,
'name': 'table',
'supercategory': 'shape',
},
{
'id': 3,
'name': 'figure',
'supercategory': 'shape',
},
]
def class_text_to_int(row_label: str) -> "int | None":
    """Map a class label to its 1-based integer class id.

    Args:
        row_label: Class name taken from an annotation row.

    Returns:
        1 for 'text', 2 for 'table', 3 for 'figure'; None otherwise.
    """
    # Explicit lookup: the original ended in a bare `None` expression, which
    # only returned None by falling off the end of the function.
    return {'text': 1, 'table': 2, 'figure': 3}.get(row_label)
from utils_convert_draw_format.Converter import ConverterCLAW, ConverterAbstract
def split(df, group):
data = namedtuple('data', ['filename', 'object'])
......@@ -84,26 +37,26 @@ def filter_for_annotations(root, files, image_filename):
return files
def convert_csv_to_coco():
def convert_csv_to_coco(converter: ConverterAbstract):
coco_output = {
"info": INFO,
"licenses": LICENSES,
"categories": CATEGORIES,
"info": converter.INFO,
"licenses": converter.LICENSES,
"categories": converter.CATEGORIES,
"images": [],
"annotations": []
}
image_id = 1
segmentation_id = 1
ANNOTATION_DIR = os.path.join(CUR_DATA_DIR, "csv")
IMAGE_DIR = os.path.join(CUR_DATA_DIR, "images/" + DATASETNAME)
OUTPUT_DIR = os.path.join(CUR_DATA_DIR, "jsons")
ANNOTATION_DIR = os.path.join(converter.WORK_DATA_DIR, "csv")
IMAGE_DIR = os.path.join(converter.WORK_DATA_DIR, "images/" + converter.DATASETNAME)
OUTPUT_DIR = os.path.join(converter.WORK_DATA_DIR, "jsons")
# filter for jpeg images
for root, _, files in os.walk(IMAGE_DIR):
image_files = filter_for_jpeg(root, files)
examples = pd.read_csv(os.path.join(ANNOTATION_DIR, DATASETNAME + '.csv'))
examples = pd.read_csv(os.path.join(ANNOTATION_DIR, converter.DATASETNAME + '.csv'))
grouped = split(examples, 'filename')
# go through each image
......@@ -116,8 +69,8 @@ def convert_csv_to_coco():
group = [group for group in grouped if group.filename == image_filename.split('/')[-1]][0]
for index, row in group.object.iterrows():
class_id = [x['id'] for x in CATEGORIES if x['name'] in row['class']][0]
category_info = {'id': class_id, 'is_crowd': 0} # нет составных объектов
class_id = [x['id'] for x in converter.CATEGORIES if x['name'] in row['class']][0]
category_info = {'id': class_id, 'is_crowd': 0} # нет составных объектов
# add height, weight
height, width = row['height'], row['width']
......@@ -128,7 +81,7 @@ def convert_csv_to_coco():
binary_mask[ymin:ymax, xmin:xmax] = 255
# create bbox
bbox = [float(xmin), float(ymin), float(xmax-xmin), float(ymax-ymin)] # x, y, width, height
bbox = [float(xmin), float(ymin), float(xmax-xmin), float(ymax-ymin)] # x, y, width, height
# create annotation
annotation_info = pycococreatortools.create_annotation_info(
......@@ -141,9 +94,11 @@ def convert_csv_to_coco():
segmentation_id = segmentation_id + 1
image_id = image_id + 1
with open(os.path.join(OUTPUT_DIR, DATASETNAME + '.json'), 'w') as output_json_file:
with open(os.path.join(OUTPUT_DIR, converter.DATASETNAME + '.json'), 'w') as output_json_file:
json.dump(coco_output, output_json_file)
if __name__ == "__main__":
convert_csv_to_coco()
converter = ConverterCLAW()
converter.DATASETNAME = "train-0"
convert_csv_to_coco(converter)
......@@ -4,6 +4,7 @@ from typing import List
import pandas as pd
from config_project import CUR_DATA_DIR, RAW_DATA_DIR
from utils_convert_draw_format.ConverterCLAW import ConverterCLAW
def parse_xml(file_xml: str) -> List:
......@@ -54,5 +55,7 @@ def create_csv(path_labels: str, out_csv_file: str):
if __name__ == "__main__":
    # The labels sub-directory must be a RELATIVE component: os.path.join
    # discards everything before an absolute one, so the former "/labels"
    # made every call read from the filesystem root instead of the raw dir.
    create_csv(os.path.join(RAW_DATA_DIR, "labels"), os.path.join(CUR_DATA_DIR, "csv/train-0.csv"))
    create_csv(os.path.join(RAW_DATA_DIR, "labels"), os.path.join(CUR_DATA_DIR, "csv/test.csv"))
    converter = ConverterCLAW()
    create_csv(os.path.join(converter.RAW_DATA_DIR, "labels"), os.path.join(converter.WORK_DATA_DIR, "csv/train-0.csv"))
    create_csv(os.path.join(converter.RAW_DATA_DIR, "labels"), os.path.join(converter.WORK_DATA_DIR, "csv/test.csv"))
......@@ -13,6 +13,8 @@ from typing import Dict
from config_project import CUR_DATA_DIR
# Define color code
from utils_convert_draw_format.ConverterCLAW import ConverterCLAW
colors = {'title': (255, 0, 0),
'text': (0, 255, 0),
'figure': (0, 0, 255),
......@@ -74,8 +76,7 @@ def markup(image: PIL.Image, annotations: Dict, samples: Dict, font: FreeTypeFon
return np.array(image)
def draw_annotation(json_path: str = 'jsons/train.json', path_images: str = "images/train-0/train/",
path_out: str = "labeled/train-0/"):
def draw_annotation(json_path: str, path_images: str, path_out: str):
with open(json_path, 'r') as fp:
samples = json.load(fp)
......@@ -104,6 +105,7 @@ def draw_annotation(json_path: str = 'jsons/train.json', path_images: str = "ima
if __name__ == "__main__":
draw_annotation(json_path=os.path.join(CUR_DATA_DIR, "jsons/train-0.json"),
path_images=os.path.join(CUR_DATA_DIR, "images/train-0/"),
path_out=os.path.join(CUR_DATA_DIR, "images/train-0-labeled/"))
converter = ConverterCLAW()
draw_annotation(json_path=os.path.join(converter.WORK_DATA_DIR, "jsons/train-0.json"),
path_images=os.path.join(converter.WORK_DATA_DIR, "images/train-0/"),
path_out=os.path.join(converter.WORK_DATA_DIR, "images/train-0-labeled/"))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment