英文:
How to convert two/three-column images to text with Tesseract.js (OCR)?
问题
我正在进行一个React.js项目,几乎已经完成了,但我的问题是,如果我想使用Tesseract(OCR)将两列/三列图像转换为文本,它不会按我想要的方式进行转换。因为两列的文本混在一起了,无法按列单独转换。有没有可能以某种方式解决这个问题?
我尝试过使用opencv.js,但无法解决它。
英文:
I am working on a React.js project that is almost done, but I have one problem: when I convert two- or three-column images to text with Tesseract (OCR), the result is not what I want. The text of the columns gets mixed together instead of being converted column by column. Is it possible to solve this problem?
import React, { useState, useEffect } from "react";
import Tesseract from "tesseract.js";
import ClipboardJS from "clipboard";
import Select from "react-select";
/**
 * Options for the language multi-select.
 *
 * Kept as compact [code, label] pairs and expanded into the
 * `{ value, label }` objects consumed by the <Select> component; `value` is
 * the language code that handleSubmit joins with "+" for Tesseract.
 */
const languageOptions = [
  ["afr", "Afrikaans"],
  ["amh", "Amharic"],
  ["ara", "Arabic"],
  ["asm", "Assamese"],
  ["aze", "Azerbaijani"],
  ["aze_cyrl", "Azerbaijani - Cyrillic"],
  ["bel", "Belarusian"],
  ["ben", "Bengali"],
  ["bod", "Tibetan"],
  ["bos", "Bosnian"],
  ["bul", "Bulgarian"],
  ["cat", "Catalan; Valencian"],
  ["ceb", "Cebuano"],
  ["ces", "Czech"],
  ["chi_sim", "Chinese - Simplified"],
  ["chi_tra", "Chinese - Traditional"],
  ["chr", "Cherokee"],
  ["cym", "Welsh"],
  ["dan", "Danish"],
  ["deu", "German"],
  ["dzo", "Dzongkha"],
  ["ell", "Greek, Modern (1453-)"],
  ["eng", "English"],
  ["enm", "English, Middle (1100-1500)"],
  ["epo", "Esperanto"],
  ["est", "Estonian"],
  ["eus", "Basque"],
  ["fas", "Persian"],
  ["fin", "Finnish"],
  ["fra", "French"],
  ["frk", "German Fraktur"],
  ["frm", "French, Middle (ca. 1400-1600)"],
  ["gle", "Irish"],
  ["glg", "Galician"],
  ["grc", "Greek, Ancient (-1453)"],
  ["guj", "Gujarati"],
  ["hat", "Haitian; Haitian Creole"],
  ["heb", "Hebrew"],
  ["hin", "Hindi"],
  ["hrv", "Croatian"],
  ["hun", "Hungarian"],
  ["iku", "Inuktitut"],
  ["ind", "Indonesian"],
  ["isl", "Icelandic"],
  ["ita", "Italian"],
  ["ita_old", "Italian - Old"],
  ["jav", "Javanese"],
  ["jpn", "Japanese"],
  ["kan", "Kannada"],
  ["kat", "Georgian"],
  ["kat_old", "Georgian - Old"],
  ["kaz", "Kazakh"],
  ["khm", "Central Khmer"],
  ["kir", "Kirghiz; Kyrgyz"],
  ["kor", "Korean"],
  ["kur", "Kurdish"],
  ["lao", "Lao"],
  ["lat", "Latin"],
  ["lav", "Latvian"],
  ["lit", "Lithuanian"],
  ["mal", "Malayalam"],
  ["mar", "Marathi"],
  ["mkd", "Macedonian"],
  ["mlt", "Maltese"],
  ["msa", "Malay"],
  ["mya", "Burmese"],
  ["nep", "Nepali"],
  ["nld", "Dutch; Flemish"],
  ["nor", "Norwegian"],
  ["ori", "Oriya"],
  ["pan", "Panjabi; Punjabi"],
  ["pol", "Polish"],
  ["por", "Portuguese"],
  ["pus", "Pushto; Pashto"],
  ["ron", "Romanian; Moldavian; Moldovan"],
  ["rus", "Russian"],
  ["san", "Sanskrit"],
  ["sin", "Sinhala; Sinhalese"],
  ["slk", "Slovak"],
  ["slv", "Slovenian"],
  ["spa", "Spanish; Castilian"],
  ["spa_old", "Spanish; Castilian - Old"],
  ["sqi", "Albanian"],
  ["srp", "Serbian"],
  ["srp_latn", "Serbian - Latin"],
  ["swa", "Swahili"],
  ["swe", "Swedish"],
  ["syr", "Syriac"],
  ["tam", "Tamil"],
  ["tel", "Telugu"],
  ["tgk", "Tajik"],
  ["tgl", "Tagalog"],
  ["tha", "Thai"],
  ["tir", "Tigrinya"],
  ["tur", "Turkish"],
  ["uig", "Uighur; Uyghur"],
  ["ukr", "Ukrainian"],
  ["urd", "Urdu"],
  ["uzb", "Uzbek"],
  ["uzb_cyrl", "Uzbek - Cyrillic"],
  ["vie", "Vietnamese"],
  ["yid", "Yiddish"],
].map(([value, label]) => ({ value, label }));
const ImagesToText = () => {
// True while handleSubmit is running OCR; switches the right pane between
// the progress bar and the recognised text.
const [isLoading, setIsLoading] = useState(false);
// Files picked through the file input (converted to a plain array on upload).
const [images, setImages] = useState([]);
// Recognised paragraphs, appended image by image in processing order.
const [texts, setTexts] = useState([]);
// Conversion progress as a percentage (0-100) of processed images.
const [progress, setProgress] = useState(0);
// 1-based position of the image currently being processed ("x of n").
const [currentImageIndex, setCurrentImageIndex] = useState(0);
// Validation message rendered under the file input.
const [errorMessage, setErrorMessage] = useState("");
// Validation message rendered under the language select.
const [errorLanguagesMessage, setErrorLanguagesMessage] = useState("");
// Option objects chosen in the language select; handleSubmit reads `.value`.
const [selectedLanguages, setSelectedLanguages] = useState([]);
/**
 * Change handler for the file input: stores the picked files as a plain
 * array and clears any earlier "no image selected" message.
 *
 * @param {{ target: { files: ArrayLike<unknown> } }} event - change event of the file input.
 */
const handleImageUpload = (event) => {
  const { files } = event.target;
  setImages(Array.from(files));
  setErrorMessage("");
};
/**
 * Copies all recognised paragraphs to the clipboard, one paragraph per line.
 *
 * `navigator.clipboard.writeText` returns a promise that rejects when the
 * write is not permitted, and `navigator.clipboard` can be missing altogether
 * (the Clipboard API is limited to secure contexts). Both cases are logged
 * here instead of surfacing as an unhandled rejection / TypeError from the
 * click handler.
 *
 * @returns {Promise<void>} settles once the copy attempt has finished; never rejects.
 */
const handleCopyText = async () => {
  const textWithSoftLineBreaks = texts.join("\n");
  if (typeof navigator.clipboard?.writeText !== "function") {
    console.error("Clipboard API is not available in this context.");
    return;
  }
  try {
    await navigator.clipboard.writeText(textWithSoftLineBreaks);
  } catch (err) {
    console.error(err);
  }
};
/**
 * Downloads the recognised text as "converted_text.txt".
 *
 * The text is wrapped in a Blob and handed to a temporary <a download>
 * element that is clicked programmatically. The anchor is removed even if the
 * click throws, and the object URL is revoked afterwards so the Blob is not
 * kept alive for the lifetime of the page.
 */
const handleDownloadText = () => {
  const textBlob = new Blob([texts.join("\n")], { type: "text/plain" });
  const objectUrl = URL.createObjectURL(textBlob);
  const element = document.createElement("a");
  element.href = objectUrl;
  element.download = "converted_text.txt";
  document.body.appendChild(element);
  try {
    element.click();
  } finally {
    document.body.removeChild(element);
    // Deferred so the browser has started the download before the URL goes away.
    setTimeout(() => URL.revokeObjectURL(objectUrl), 0);
  }
};
// Binds ClipboardJS to every ".copy-button" element and tears the binding
// down again whenever `texts` changes or the component unmounts.
// NOTE(review): no element in this component's JSX carries the "copy-button"
// class (the Copy Text button uses handleCopyText) — confirm this is still needed.
useEffect(() => {
  const copyButtons = new ClipboardJS(".copy-button");
  // Drop the text selection ClipboardJS leaves behind after a successful copy.
  copyButtons.on("success", (event) => {
    event.clearSelection();
  });
  return () => {
    copyButtons.destroy();
  };
}, [texts]);
/**
 * Reset button handler: puts the conversion state back to its initial values
 * and then reloads the page.
 */
const handleReset = () => {
  // Validation messages.
  setErrorLanguagesMessage("");
  setErrorMessage("");
  // Uploaded files and recognised text.
  setTexts([]);
  setImages([]);
  // Progress indicators.
  setCurrentImageIndex(0);
  setProgress(0);
  setIsLoading(false);
  // The reload comes last, after every state reset has been issued.
  window.location.reload();
};
/**
 * Runs OCR over every selected image, one after another, and appends the
 * recognised paragraphs to `texts`.
 *
 * Validation: an image and at least one language must be selected; otherwise
 * the matching error message is set and nothing else happens.
 *
 * Failure: the first image that fails aborts the whole run — texts and
 * progress are cleared, an error message is shown and loading stops. Progress
 * is only advanced after a successful recognition, so the reset to 0 is not
 * overwritten afterwards.
 *
 * @returns {Promise<void>} settles when the run has finished or been aborted.
 */
const handleSubmit = async () => {
  if (images.length === 0) {
    setErrorMessage("Select an image to convert.");
    return;
  }
  if (selectedLanguages.length === 0) {
    setErrorLanguagesMessage("Select any language.");
    return;
  }

  setIsLoading(true);
  setProgress(0);
  setTexts([]);
  setCurrentImageIndex(0);
  setErrorMessage("");
  setErrorLanguagesMessage("");

  // The language string is the same for every image, so build it once:
  // selected codes joined with "+", e.g. "eng+deu".
  const languages = selectedLanguages.map((lang) => lang.value).join("+");
  const totalImages = images.length;

  for (const [index, image] of images.entries()) {
    setCurrentImageIndex(index + 1);
    try {
      const result = await Tesseract.recognize(image, languages);
      // Blank lines in the OCR output separate paragraphs.
      const paragraphs = result.data.text.split("\n\n");
      // NOTE(review): splitting on /[.|?]\s/ and re-joining with " " removes
      // every ".", "|" or "?" that is followed by whitespace, i.e. it strips
      // sentence-ending punctuation from the result. Kept as-is; confirm
      // whether that is intended.
      const formattedParagraphs = paragraphs.map((paragraph) =>
        paragraph.split(/[.|?]\s/).join(" ")
      );
      setTexts((prevTexts) => [...prevTexts, ...formattedParagraphs]);
    } catch (err) {
      console.error(err);
      // Clear texts and stop the conversion process immediately on error.
      setTexts([]);
      setProgress(0);
      setErrorMessage("Text conversion failed. Please try again.");
      setIsLoading(false);
      return;
    }
    setProgress(((index + 1) / totalImages) * 100);
  }

  setIsLoading(false);
};
// Two-column layout.
// Left column: file input, language multi-select, "Start Convert", a
// "Download Text" button (rendered only once there is recognised text), and
// the "Reset" / "Copy Text" buttons (always rendered).
// Right column: heading, then either the progress bar with "x of n" while
// isLoading is true, or the recognised paragraphs once loading has finished
// and texts is non-empty.
return (
<div className="container" style={{ height: "97vh" }}>
<div className="row h-100 mt-3">
<div className="col-md-3 left-bar sticky-top border 1 ms-2">
<h1 className="center py-3 mc-5 underline">Images to text (ocr)</h1>
<input
type="file"
onChange={handleImageUpload}
className="form-control mt-5 mb-2"
multiple
accept="image/*"
/>
{errorMessage && <p className="text-danger">{errorMessage}</p>}
<Select
isMulti
options={languageOptions}
value={selectedLanguages}
onChange={setSelectedLanguages}
placeholder="Select languages..."
/>
{errorLanguagesMessage && (
<p className="text-danger">{errorLanguagesMessage}</p>
)}
<input
type="button"
onClick={handleSubmit}
className="btn btn-outline-success mt-3"
value="Start Convert"
/>
{texts.length > 0 && (
<button
className="btn btn-primary mt-3 ms-1"
onClick={handleDownloadText}
>
Download Text
</button>
)}
<div className="mt-1">
<button className=" btn ml-2 btn-danger" onClick={handleReset}>
Reset
</button>
<button
className="mt-3 btn btn-secondary d-inline ms-1 "
onClick={handleCopyText}
>
Copy Text
</button>
</div>
</div>
<div className="col-md-8 right-bar border 1 ms-2">
<h4 className="mt-5 text-center">Select an Image to convert (ocr)</h4>
{isLoading && (
<div className="text-center">
<div className="text-center">
<progress
className="custom-progress-bar"
value={progress}
max="100"
></progress>
<p className="text-center py-0 my-0">
Converting...: {progress.toFixed(0)}% ({currentImageIndex} of{" "}
{images.length})
</p>
</div>
</div>
)}
{!isLoading && texts.length > 0 && (
<div>
<div className="form-control box-p w-100 mt-5 m-none">
{texts.map((paragraph, index) => (
<p key={index}>{paragraph}</p>
))}
</div>
</div>
)}
</div>
</div>
</div>
);
};
export default ImagesToText;
I tried using opencv.js, but I couldn't solve it.
答案1
得分: 1
尝试使用不同的页面分割模式(PSM)。默认情况下,Tesseract.js 使用 `PSM_SINGLE_BLOCK`,它假定文本是一个统一的单一文本块,但这里的情况并非如此。我建议先尝试 `PSM_AUTO_OSD`,看看能得到什么结果,然后再进一步尝试其他 PSM。
我在你的文档上使用了 `PSM_AUTO_OSD`,发现它先输出第一列(左边),然后输出第二列(右边)。
英文:
Try using a different Page Segmentation Mode (PSM). By default, Tesseract.js uses `PSM_SINGLE_BLOCK`, which assumes that the text comes as a single uniform block, which is not the case here. I'd recommend trying `PSM_AUTO_OSD` to see what results you get, and then experimenting further with the other PSMs. The mode is set on a worker, e.g. `await worker.setParameters({ tessedit_pageseg_mode: PSM.AUTO_OSD })`.
I used `PSM_AUTO_OSD` with your document and found that it printed column 1 (left), followed by column 2 (right).
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论