Upload folder using huggingface_hub
scripts/model.py +6 -39
scripts/model.py CHANGED
@@ -50,13 +50,9 @@ def improve_ocr_accuracy(img):
         np.ndarray: The preprocessed image as a binary thresholded array.

     """
-    # Read image with PIL (for color preservation)
     img = Image.open(img)
-
-    # Increase image size (can improve accuracy for small text)
     img = img.resize((img.width * 4, img.height * 4))

-    # Increase contrast
     enhancer = ImageEnhance.Contrast(img)
     img = enhancer.enhance(2)

@@ -78,27 +74,19 @@ def create_ocr_outputs():
     directory_path = os.getcwd() + '/data/processed/hand_labeled_tables/hand_labeled_tables'

     for root, dirs, files in os.walk(directory_path):
-        # Print the current directory
-        print(f"Current directory: {root}")
-
-        # Print all subdirectories in the current directory
-        print("Subdirectories:")
         for dir in dirs:
             print(f"- {dir}")

-        # Print all files in the current directory
-        print("Files:")
         for image_path in files:
             print(f"- {image_path}")
             full_path = os.path.join(root, image_path)
-            # Preprocess the image
             preprocessed_image = improve_ocr_accuracy(full_path)

             ocr_text = ocr_core(preprocessed_image)
             with open(os.getcwd() + f"/data/processed/annotations/{image_path.split('.')[0]}.txt", 'wb') as f:
                 f.write(ocr_text.encode('utf-8'))

-        print("\n")
+        print("\n")

 def prepare_dataset(ocr_dir, csv_dir, output_file):
     """
@@ -143,10 +131,7 @@ def tokenize_function(examples):
         dict: A dictionary containing tokenized inputs and labels.

     """
-    # Tokenize the inputs
     inputs = tokenizer(examples['prompt'], truncation=True, padding='max_length', max_length=1012)
-
-    # Create labels which are the same as input_ids
     inputs['labels'] = inputs['input_ids'].copy()
     return inputs

@@ -172,29 +157,23 @@ def calculate_metrics(model, tokenizer, texts, labels):

     with torch.no_grad():
         for text, label in zip(texts, labels):
-            # Tokenize input and label
             input_ids = tokenizer.encode(text, return_tensors="pt")
             label_ids = tokenizer.encode(label, return_tensors="pt")[0]

-            # Generate prediction
             output = model.generate(input_ids, max_length=input_ids.shape[1] + len(label_ids), num_return_sequences=1)
             predicted_ids = output[0][input_ids.shape[1]:]

-            # Convert ids to tokens
             predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_ids)
             label_tokens = tokenizer.convert_ids_to_tokens(label_ids)

-            # Extend predictions and labels
             all_predictions.extend(predicted_tokens)
             all_labels.extend(label_tokens)

-            # Calculate loss
             outputs = model(input_ids=input_ids, labels=label_ids.unsqueeze(0))
             loss = outputs.loss
             total_loss += loss.item() * len(label_ids)
             total_tokens += len(label_ids)

-    # Calculate metrics
     precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=0)
     recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=0)
     f1 = f1_score(all_labels, all_predictions, average='weighted', zero_division=0)
@@ -211,10 +190,8 @@ if __name__ == '__main__':
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {device}")

-    # Load a pretrained YOLOv8 model
     model = YOLO('yolov8l.pt')

-    # Train the model on your custom dataset
     results = model.train(
         data='config.yaml',
         epochs=1,
@@ -232,15 +209,11 @@ if __name__ == '__main__':
     model.save(os.getcwd() + '/models/trained_yolov8.pt')

     create_ocr_outputs()
-
-    # Usage
     ocr_dir = os.getcwd() + '/data/processed/annotations'
     csv_dir = os.getcwd() + '/data/processed/hand_labeled_tables'
     output_file = 'dataset.jsonl'
     prepare_dataset(ocr_dir, csv_dir, output_file)

-
-    # Load the dataset
     dataset = load_dataset('json', data_files={'train': 'dataset.jsonl'})
     dataset = dataset['train'].train_test_split(test_size=0.1)

@@ -262,13 +235,12 @@ if __name__ == '__main__':
         weight_decay=0.01,
         logging_dir='./logs',
         logging_steps=10,
-        evaluation_strategy="epoch",
-        save_strategy="epoch",
-        load_best_model_at_end=True,
-        metric_for_best_model="eval_loss",
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        metric_for_best_model="eval_loss",
     )

-    # Trainer
     trainer = Trainer(
         model=model,
         args=training_args,
@@ -276,21 +248,16 @@ if __name__ == '__main__':
         eval_dataset=tokenized_dataset['test'],
     )

-    # Train the model
     trainer.train()

-    # Evaluate the model
     eval_results = trainer.evaluate()
     print(f"Evaluation results: {eval_results}")

-    # Save the model
     gpt_model.save_pretrained(os.getcwd() + '/models/gpt')
     tokenizer.save_pretrained(os.getcwd() + '/models/gpt')
-
-    # Calculate metrics
+
     precision, recall, f1 = calculate_metrics(gpt_model, tokenizer, dataset['test']['text'], dataset['test']['label'])

-    # Display metrics
     print(f"Precision: {precision:.4f}")
     print(f"Recall: {recall:.4f}")
     print(f"F1 Score: {f1:.4f}")