fix: Carmodoo search encoding issue - Korean car names garbled
lxml was re-encoding the already-decoded UTF-8 HTML based on the charset="euc-kr" meta tag. Fixed by removing the charset meta tags and explicitly setting UTF-8 as the encoding on the HTMLParser.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -380,7 +380,14 @@ class CarmodooClient:
|
||||
cars = []
|
||||
|
||||
try:
|
||||
tree = lxml_html.fromstring(html)
|
||||
# HTML 내부의 charset 선언 제거 (이미 UTF-8로 디코딩됨)
|
||||
html = re.sub(r'<meta[^>]*charset[^>]*>', '', html, flags=re.IGNORECASE)
|
||||
html = re.sub(r'charset\s*=\s*["\']?euc-kr["\']?', 'charset="utf-8"', html, flags=re.IGNORECASE)
|
||||
|
||||
# lxml에 UTF-8 인코딩임을 명시
|
||||
from lxml.html import HTMLParser
|
||||
parser = HTMLParser(encoding='utf-8')
|
||||
tree = lxml_html.document_fromstring(html.encode('utf-8'), parser=parser)
|
||||
|
||||
# 각 차량 행 찾기 (tr id="trCtl_XXXXXXX")
|
||||
car_rows = tree.xpath('//tr[starts-with(@id, "trCtl_")]')
|
||||
|
||||
Reference in New Issue
Block a user