Skip to content

Instantly share code, notes, and snippets.

@gabonator
Created August 4, 2024 22:06
Show Gist options
  • Save gabonator/f8eb7706053d89f1da10ca13b8ee9707 to your computer and use it in GitHub Desktop.
Save gabonator/f8eb7706053d89f1da10ca13b8ee9707 to your computer and use it in GitHub Desktop.
OCR with OSX Vision
// OCR for photos of tabular data, using only the built-in Vision framework (no packages to install).
// Usage: swift recognize.swift photo.jpeg
// Prints the recognized table as a 2D array of strings (JSON-like output).
import Cocoa
import Vision
/// One recognized piece of text together with its integer bounding rectangle.
struct TableCell {
    /// Horizontal position of the rectangle.
    var x: Int

    /// Vertical position of the rectangle.
    var y: Int

    /// Horizontal extent of the rectangle.
    var width: Int

    /// Vertical extent of the rectangle.
    var height: Int

    /// The text content of the cell.
    var text: String
}
/// Runs Vision text recognition on `image` and returns one `TableCell` per recognized observation.
///
/// Bounding boxes are converted from Vision's normalized, lower-left-origin rectangle to
/// integers scaled by 1000 with a top-left origin, so `(x, y, width, height)` describes the
/// box as measured from the top-left corner of the image.
///
/// - Parameters:
///   - image: The image to scan.
///   - languages: Recognition languages handed to the request. Defaults to `["sk-SK"]`,
///     the value this script has always used.
/// - Returns: The recognized cells, or an empty array when no `CGImage` can be obtained
///   or the request fails.
func recognizeText(in image: NSImage, languages: [String] = ["sk-SK"]) -> [TableCell] {
    guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
        print("Could not get CGImage")
        return []
    }

    var items: [TableCell] = []
    let requestHandler = VNImageRequestHandler(cgImage: cgImage, options: [:])

    // NOTE: `items` is filled inside the completion handler; the function relies on the
    // handler having run by the time `perform` returns.
    let request = VNRecognizeTextRequest { request, error in
        if let error = error {
            print("Text recognition error: \(error)")
            return
        }
        guard let results = request.results as? [VNRecognizedTextObservation] else {
            print("No text detected")
            return
        }
        for observation in results {
            guard let topCandidate = observation.topCandidates(1).first else { continue }
            let box = observation.boundingBox
            // The box origin is its lower-left corner in a bottom-up coordinate space, so the
            // top edge in top-down coordinates is 1 - (origin.y + height). Using origin.y
            // alone would yield the bottom edge and shift the whole box down by its height.
            let topEdge = box.origin.y + box.size.height
            items.append(TableCell(
                x: Int(box.origin.x * 1000),
                y: Int(1000 - topEdge * 1000),
                width: Int(box.size.width * 1000),
                height: Int(box.size.height * 1000),
                text: topCandidate.string))
        }
    }

    // Favor accuracy over speed and restrict recognition to the requested languages.
    request.recognitionLevel = .accurate
    request.recognitionLanguages = languages

    do {
        try requestHandler.perform([request])
    } catch {
        print("Failed to perform request: \(error)")
        return []
    }
    return items
}
/// Reports whether two closed integer ranges, given as `(start, end)` tuples, share at least one point.
///
/// Ranges that merely touch (one ends where the other starts) count as intersecting.
func intersects(a: (Int, Int), b: (Int, Int)) -> Bool {
    let aEndsBeforeBStarts = a.1 < b.0
    let aStartsAfterBEnds = a.0 > b.1
    if aEndsBeforeBStarts || aStartsAfterBEnds {
        return false
    }
    return true
}
/// Merges overlapping `(start, end)` ranges into groups and returns the merged ranges.
///
/// Ranges are processed in ascending order of their start. Each range is folded into the
/// first existing group it overlaps (touching counts as overlapping), widening that group
/// to cover it; otherwise it starts a new group.
///
/// - Parameter coordinates: Closed integer ranges as `(start, end)` tuples, in any order.
/// - Returns: The merged groups, ordered by start. Empty input yields an empty array.
func groupCoordinatesByProximity(coordinates: [(Int, Int)]) -> [(Int, Int)] {
    var bins: [(Int, Int)] = []
    for coordinate in coordinates.sorted(by: { $0.0 < $1.0 }) {
        // Same closed-range overlap test as `intersects`, inlined here.
        let match = bins.firstIndex { bin in
            bin.1 >= coordinate.0 && bin.0 <= coordinate.1
        }
        if let index = match {
            // Mutate the stored bin through its index: iterating with `for var bin in bins`
            // only changes a copy, which leaves the group un-widened.
            bins[index].0 = min(bins[index].0, coordinate.0)
            bins[index].1 = max(bins[index].1, coordinate.1)
        } else {
            bins.append(coordinate)
        }
    }
    return bins
}
/// Finds the position of the first range in `arr` for which `intersects` reports an overlap with `rng`.
///
/// - Parameters:
///   - rng: The `(start, end)` range to look up.
///   - arr: Candidate ranges, searched in order.
/// - Returns: The index of the first matching range, or `-1` when none matches.
func findCoord(rng: (Int, Int), arr: [(Int, Int)]) -> Int {
    let firstMatch = arr.firstIndex { candidate in
        intersects(a: rng, b: candidate)
    }
    return firstMatch ?? -1
}
// Entry point: OCR the image named by the first command-line argument and print the
// recognized text arranged as a table (rows of columns).
if CommandLine.arguments.count < 2 {
    // Without this check, reading arguments[1] traps with an index-out-of-range error.
    print("Usage: swift recognize.swift photo.jpeg")
    exit(1)
}

// Load an image
if let image = NSImage(contentsOfFile: CommandLine.arguments[1]) {
    var cells = recognizeText(in: image)
    cells.sort(by: { $0.y < $1.y })
    //for c in cells {
    //    print("\(c.x) \(c.y) \(c.text)")
    //}

    // Drop everything down to (and slightly past) the cell whose text is "objektu".
    // When no such cell exists, the fallback of -10 makes the threshold 0.
    let trimy = cells.first(where: { $0.text == "objektu" })?.y ?? -10
    cells = cells.filter { $0.y >= trimy + 10 }

    // Group the horizontal and vertical extents of all cells into columns and rows.
    let xs = cells.map { ($0.x, $0.x + $0.width) }
    let xsg = groupCoordinatesByProximity(coordinates: xs)
    let ys = cells.map { ($0.y, $0.y + $0.height) }
    let ysg = groupCoordinatesByProximity(coordinates: ys)

    let columns = xsg.count
    let rows = ysg.count
    var table: [[String]] = Array(repeating: Array(repeating: "", count: columns), count: rows)

    for cell in cells {
        let row = findCoord(rng: (cell.y, cell.y + cell.height), arr: ysg)
        let column = findCoord(rng: (cell.x, cell.x + cell.width), arr: xsg)
        // findCoord signals "no group" with -1; skip instead of trapping on a negative index.
        guard row >= 0, column >= 0 else { continue }
        // Several texts landing in the same row/column are concatenated.
        table[row][column] += cell.text
    }
    print(table)
} else {
    print("Could not load image")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment