본문 바로가기

Coding/TIL

TIL | #12 | NAVER CLOVA OCR | 23.12.10.(일)

23.12.10.(SUN).TIL.

NAVER CLOVA OCR

NAVER CLOVA OCR

import requests
import uuid
import time
import base64
import json
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there can be opened
# from this Colab session.
drive.mount('/content/drive')

# Placeholders: replace with the real OCR invoke URL, the secret key that is
# sent in the X-OCR-SECRET header, and the path of the image to recognise.
api_url = 'api_url'
secret_key = 'secret_key'
image_file = 'image_file'


# The image travels inside the JSON body, so it is read as raw bytes and
# base64-encoded below.
with open(image_file, 'rb') as f:
    file_data = f.read()

request_json = {
    'images': [
        {
            # NOTE(review): 'format' is hard-coded; presumably it has to match
            # the real type of `image_file` -- confirm before using non-JPEG
            # input.
            'format': 'jpg',
            'name': 'demo',
            'data': base64.b64encode(file_data).decode(),
        },
    ],
    # A fresh random id per request.
    'requestId': str(uuid.uuid4()),
    'version': 'V2',
    # Current time in milliseconds since the epoch.
    'timestamp': int(round(time.time() * 1000)),
}

payload = json.dumps(request_json).encode('UTF-8')
headers = {
    'X-OCR-SECRET': secret_key,
    'Content-Type': 'application/json',
}

# Without a timeout `requests` waits forever on an unresponsive server.
response = requests.post(api_url, headers=headers, data=payload, timeout=30)
# Fail here with a clear HTTPError (bad key, bad URL, ...) instead of a
# confusing KeyError later when the error body is indexed like a result.
response.raise_for_status()

import re
from typing import Optional

# Date written as "YYYY.MM.DD".
_DATE_RE = re.compile(r"(\d{4})\.(\d{2})\.(\d{2})")
# Clock time written as "H:MM" or "HH:MM".
_TIME_RE = re.compile(r"(\d{1,2}):(\d{2})")


def collect_keyword_text(fields, keyword='일시'):
    """Concatenate the ``inferText`` of every field that contains *keyword*.

    Args:
        fields: Iterable of dicts that each carry an ``'inferText'`` string.
        keyword: Substring a field must contain to be kept.

    Returns:
        The kept texts joined in their original order ('' when none match).
    """
    # NOTE(review): only fields that themselves contain the keyword are kept;
    # if the service returns the date in a separate field it never reaches
    # the parser -- confirm against a real response.
    return ''.join(
        field['inferText'] for field in fields if keyword in field['inferText']
    )


def parse_schedule(text: str) -> Optional[dict]:
    """Parse ``"<date>.<weekday> <start>~<end>"`` style schedule text.

    The date may be preceded by other characters (for example the keyword
    label itself), the dot after the day is optional, and the ``~<end>`` part
    may be missing.

    Args:
        text: Recognised text, e.g. ``"2023.12.10.일 14:00~16:00"``.

    Returns:
        A dict with the string values ``year``, ``month``, ``day``,
        ``weekday``, ``start_hour``, ``start_minute``, ``end_hour`` and
        ``end_minute`` (the last two are ``None`` when no end time is found),
        or ``None`` when *text* holds no date / weekday / start time.
    """
    start_part, *end_parts = text.split('~')

    # search(), not match(): the date does not have to sit at position 0.
    date_match = _DATE_RE.search(start_part)
    if date_match is None:
        return None
    year, month, day = date_match.groups()

    # What follows the date is expected to be "<weekday> <HH:MM>".
    tokens = start_part[date_match.end():].lstrip('.').split()
    if len(tokens) < 2:
        return None
    weekday = tokens[0]
    start_match = _TIME_RE.search(tokens[1])
    if start_match is None:
        return None
    start_hour, start_minute = start_match.groups()

    end_hour = end_minute = None
    if end_parts:
        end_match = _TIME_RE.search(end_parts[0])
        if end_match is not None:
            end_hour, end_minute = end_match.groups()

    return {
        'year': year,
        'month': month,
        'day': day,
        'weekday': weekday,
        'start_hour': start_hour,
        'start_minute': start_minute,
        'end_hour': end_hour,
        'end_minute': end_minute,
    }


# Guarded so the helpers above can be imported without a live OCR response;
# when run as a script or notebook cell this executes exactly as before.
if __name__ == "__main__":
    # Keep only the text of fields that contain the schedule keyword.
    recognized_text = collect_keyword_text(
        response.json()['images'][0]['fields']
    )

    # Validate and split the schedule text.
    schedule = parse_schedule(recognized_text)
    if schedule is None:
        print("Invalid date format")
        exit()

    # Print in the expected output format.
    print(f"년 : {schedule['year']}")
    print(f"월 : {schedule['month']}")
    print(f"일 : {schedule['day']}")
    print(f"요일 : {schedule['weekday']}")
    print(f"시작 시간 : {schedule['start_hour']}:{schedule['start_minute']}")
    # The end time is optional; previously this line always raised NameError.
    if schedule['end_hour'] is not None:
        print(f"종료 시간 : {schedule['end_hour']}:{schedule['end_minute']}")

import requests
import uuid
import time
import base64
import json
from google.colab import drive

# Mount Google Drive at /content/drive so the image file can be read from
# this Colab session.
drive.mount('/content/drive')

# Placeholders: fill in the OCR invoke URL, the secret key sent as the
# X-OCR-SECRET header, and the path of the image to recognise.
api_url = 'api_url'
secret_key = 'secret_key'
image_file = 'image_file'

# Read the image as raw bytes; it is base64-encoded into the JSON body below.
with open(image_file, 'rb') as f:
    file_data = f.read()

request_json = {
    'images': [
        {
            # NOTE(review): 'format' is hard-coded; presumably it must match
            # the actual type of `image_file` -- confirm for non-JPEG input.
            'format': 'jpg',
            'name': 'demo',
            'data': base64.b64encode(file_data).decode(),
        },
    ],
    # A fresh random id per request.
    'requestId': str(uuid.uuid4()),
    'version': 'V2',
    # Current time in milliseconds since the epoch.
    'timestamp': int(round(time.time() * 1000)),
}

payload = json.dumps(request_json).encode('UTF-8')
headers = {
    'X-OCR-SECRET': secret_key,
    'Content-Type': 'application/json',
}

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.post(api_url, headers=headers, data=payload, timeout=30)
# Surface HTTP errors immediately rather than as a KeyError when the error
# body is later indexed like a successful result.
response.raise_for_status()

def extract_from_keyword(fields, keyword='일시'):
    """Return the text of the first field containing *keyword*, from the keyword on.

    Args:
        fields: Iterable of dicts that each carry an ``'inferText'`` string.
        keyword: Substring to look for.

    Returns:
        The matching field's text starting at the first occurrence of
        *keyword*, or '' when no field contains it.
    """
    for field in fields:
        text = field['inferText']
        position = text.find(keyword)
        if position != -1:
            return text[position:]
    return ''


def strip_trailing_non_letters(text):
    """Drop characters from the end of *text* until it ends with a letter.

    "Letter" means whatever ``str.isalpha()`` accepts, which includes Hangul
    as well as other alphabets. Returns '' when *text* has no letter at all
    (the original loop raised IndexError in that case).
    """
    end = len(text)
    # Walk an index backwards instead of re-slicing the string each step.
    while end > 0 and not text[end - 1].isalpha():
        end -= 1
    return text[:end]


# Guarded so the helpers above can be imported without a live OCR response;
# when run as a script or notebook cell this executes exactly as before.
if __name__ == "__main__":
    # Take the schedule text starting at the keyword, then trim trailing
    # digits/punctuation so the result ends on a letter.
    recognized_text = strip_trailing_non_letters(
        extract_from_keyword(response.json()['images'][0]['fields'])
    )

    print(recognized_text)