# YOLO model isn't giving expected result

I used a trained model I found on GitHub: [SVHN](https://github.com/icns-distributed-cloud/YOLOv5-SVHN).

Following the repository's readme.md, my main.py is:
```python
import torch
import PIL.Image as Image
import PIL.ImageEnhance as ImageEnhance
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as img
import numpy as np
from math import *

model = torch.hub.load('icns-distributed-cloud/yolov5-svhn', 'svhn').fuse().eval()
model = model.autoshape()
img1 = Image.open(r"C:\Users\user1\Documents\third.jpg")

# Prediction
h, w = img1.size
pred = []
filter1 = ImageEnhance.Color(img1)
img1 = filter1.enhance(0.0)        # desaturate to grayscale (still 3 channels)
img1 = img1.resize((h*10, w*10))   # upscale the image 10x
img1.show()

# Run the model on a crop; if confidence is low, retry on shifted/enlarged crops
def predict(i, j, h, w):
    cropped_img = img1.crop((i, j, i+h, j+w))
    f = model(cropped_img)
    if f[0] is not None and f[0][0][4] <= 0.4:
        predict(i, j-w, h, w)
        predict(i, j, h, w+w)
        predict(i-h, j, h, w)
        predict(i, j, h*2, w)
    if f[0] is not None and f[0][0][4] >= 0.4:
        img2 = np.array(cropped_img)
        cv2.rectangle(img2, (round(f[0][0][0].item()), round(f[0][0][1].item())),
                      (round(f[0][0][2].item()), round(f[0][0][3].item())), (255, 100, 0), 6)
        plt.imshow(img2)
        plt.title('class : ' + str(f[0][0][5].item()) + " \n confidence : " + str(f[0][0][4].item()))
        plt.show()
    return f

# Slide a tile of the original size over the 10x upscaled image
for i in range(0, h*10, h):
    for j in range(0, w*10, w):
        f = predict(i, j, h, w)
        if f is not None:
            pred.append(f)

for i in pred:
    if i[0] is not None:
        for x1, y1, x2, y2, conf, clas in i[0]:
            if conf > 0.3:
                print('box: ({}, {}), ({}, {})'.format(x1, y1, x2, y2))
                print('confidence : {}'.format(conf))
                print('class: {}'.format(int(clas)))
                '''
                print((int(x1/10), int(y1/10)), (int(x2/10), int(y2/10)))
                cv2.rectangle(img2, (int(x1/10), int(y1/10)), (int(x2/10), int(y2/10)), (255, 0, 0), 3)
                plt.imshow(img2)
                plt.show()
                '''
```
The image I used is:

[![third.jpg](https://i.stack.imgur.com/IDbCb.jpg)](https://i.stack.imgur.com/IDbCb.jpg)

so obviously my expected result is 4, 8, 4, 6.

Initially it detected only the 4, because the image isn't clear. To work around that, I converted the RGB image to grayscale, resized the image, then cropped and resized again. But now my output is:
```
Using cache found in C:\Users\Sanmitha/.cache\torch\hub\icns-distributed-cloud_yolov5-svhn_master

                 from  n    params   module                                  arguments
  0                -1  1      3520   models.common.Focus                     [3, 32, 3]
  1                -1  1     18560   models.common.Conv                      [32, 64, 3, 2]
  2                -1  1     19904   models.common.BottleneckCSP             [64, 64, 1]
  3                -1  1     73984   models.common.Conv                      [64, 128, 3, 2]
  4                -1  1    161152   models.common.BottleneckCSP             [128, 128, 3]
  5                -1  1    295424   models.common.Conv                      [128, 256, 3, 2]
  6                -1  1    641792   models.common.BottleneckCSP             [256, 256, 3]
  7                -1  1   1180672   models.common.Conv                      [256, 512, 3, 2]
  8                -1  1    656896   models.common.SPP                       [512, 512, [5, 9, 13]]
  9                -1  1   1248768   models.common.BottleneckCSP             [512, 512, 1, False]
 10                -1  1    131584   models.common.Conv                      [512, 256, 1, 1]
 11                -1  1         0   torch.nn.modules.upsampling.Upsample    [None, 2, 'nearest']
 12           [-1, 6]  1         0   models.common.Concat                    [1]
 13                -1  1    378624   models.common.BottleneckCSP             [512, 256, 1, False]
 14                -1  1     33024   models.common.Conv                      [256, 128, 1, 1]
 15                -1  1         0   torch.nn.modules.upsampling.Upsample    [None, 2, 'nearest']
 16           [-1, 4]  1         0   models.common.Concat                    [1]
 17                -1  1     95104   models.common.BottleneckCSP             [256, 128, 1, False]
 18                -1  1    147712   models.common.Conv                      [128, 128, 3, 2]
 19          [-1, 14]  1         0   models.common.Concat                    [1]
 20                -1  1    313088   models.common.BottleneckCSP             [256, 256, 1, False]
 21                -1  1    590336   models.common.Conv                      [256, 256, 3, 2]
 22          [-1, 10]  1         0   models.common.Concat                    [1]
 23                -1  1   1248768   models.common.BottleneckCSP             [512, 512, 1, False]
 24      [17, 20, 23]  1     43152   models.yolo.Detect                      [11, [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], [128, 256, 512]]
Model Summary: 191 layers, 7.28206e+06 parameters, 7.28206e+06 gradients

Fusing layers...
Model Summary: 140 layers, 7.27349e+06 parameters, 7.27349e+06 gradients
Adding autoShape...
C:\Users\user1\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\functional.py:504: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ..\aten\src\ATen\native\TensorShape.cpp:3484.)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
box: (0.0, 0.19499744474887848), (137.242431640625, 128.22288513183594)
confidence : 0.30603668093681335
class: 1
box: (52.23253631591797, 0.0), (190.0, 70.11492156982422)
confidence : 0.41145217418670654
class: 2
box: (54.806610107421875, 3.3701179027557373), (190.0, 278.511474609375)
confidence : 0.4895976781845093
class: 7
box: (55.85855484008789, 0.3548659086227417), (190.0, 197.8501739501953)
confidence : 0.365851491689682
class: 8
box: (0.0, 0.0), (140.3123779296875, 216.81472778320312)
confidence : 0.3351562023162842
class: 4
box: (53.156036376953125, 0.0), (190.0, 72.2713851928711)
confidence : 0.40216246247291565
class: 7
box: (0.0, 0.0), (130.40765380859375, 213.34080505371094)
confidence : 0.3830447793006897
class: 6
box: (51.61797332763672, 0.0), (190.0, 70.91419982910156)
confidence : 0.31078973412513733
class: 7
```
The detected images are:
[![figure1](https://i.stack.imgur.com/qr5AQ.png)](https://i.stack.imgur.com/qr5AQ.png)
[![figure2](https://i.stack.imgur.com/pJlpd.png)](https://i.stack.imgur.com/pJlpd.png)
[![figure3](https://i.stack.imgur.com/jGiY0.png)](https://i.stack.imgur.com/jGiY0.png)
[![figure4](https://i.stack.imgur.com/oEOhy.png)](https://i.stack.imgur.com/oEOhy.png)
[![figure5](https://i.stack.imgur.com/asRk8.png)](https://i.stack.imgur.com/asRk8.png)
[![figure6](https://i.stack.imgur.com/nmi6H.png)](https://i.stack.imgur.com/nmi6H.png)
[![figure7](https://i.stack.imgur.com/rhrbO.png)](https://i.stack.imgur.com/rhrbO.png)
[![figure8](https://i.stack.imgur.com/gBEjo.png)](https://i.stack.imgur.com/gBEjo.png)
[![figure9](https://i.stack.imgur.com/cAlWp.png)](https://i.stack.imgur.com/cAlWp.png)
[![figure10](https://i.stack.imgur.com/iDhES.png)](https://i.stack.imgur.com/iDhES.png)
[![figure11](https://i.stack.imgur.com/Axuqa.png)](https://i.stack.imgur.com/Axuqa.png)
[![figure12](https://i.stack.imgur.com/dz5IN.png)](https://i.stack.imgur.com/dz5IN.png)
[![figure13](https://i.stack.imgur.com/QDEad.png)](https://i.stack.imgur.com/QDEad.png)
I don't know why it detects other things as digits with good confidence, but fails to detect the actual digits with good confidence... Kindly help me.
# Answer 1

**Score**: 2

You can try text recognition, something like OCR, which needs no training. Just run your images through it, get the output, and maybe add a simple fuzzy-match filter.

[![enter image description here][1]][1]

[1]: https://i.stack.imgur.com/p9FXZ.jpg
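As a rough sketch of that idea using the `pytesseract` wrapper (this assumes a local Tesseract install; the whitelist and page-segmentation settings below are illustrative choices, not something specified in this answer):

```python
import pytesseract
from PIL import Image

# Load the house-number image (path taken from the question) in grayscale.
img = Image.open(r"C:\Users\user1\Documents\third.jpg").convert("L")

# Upscale a little so the digits are easier for Tesseract to segment.
img = img.resize((img.width * 4, img.height * 4))

# Restrict recognition to digits; PSM 7 treats the image as one text line.
config = "--psm 7 -c tessedit_char_whitelist=0123456789"
text = pytesseract.image_to_string(img, config=config)

# Keep only digit characters as a crude cleanup of the raw output.
digits = "".join(ch for ch in text if ch.isdigit())
print(digits)
```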
# Answer 2

**Score**: 0

First use CRAFT (Character-Region Awareness For Text detection) to identify the text regions in your image. Then pass each cropped text region to an OCR engine such as Tesseract to extract the text.

[![enter image description here][1]][1]
[![enter image description here][2]][2]
[![enter image description here][3]][3]

[1]: https://i.stack.imgur.com/FLGtM.jpg
[2]: https://i.stack.imgur.com/XAUz7.jpg
[3]: https://i.stack.imgur.com/QN7wk.jpg
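One compact way to get this detect-then-recognize pipeline without training anything is EasyOCR, which runs a CRAFT-based detector internally before recognition. A minimal sketch, assuming the `easyocr` package is installed (the digit `allowlist` is an added assumption that suits house numbers):

```python
import easyocr

# EasyOCR pairs a CRAFT text detector with a recognition model,
# covering both stages of the suggested pipeline in one call.
reader = easyocr.Reader(['en'], gpu=False)

# Restrict recognition to digit characters.
results = reader.readtext(r"C:\Users\user1\Documents\third.jpg",
                          allowlist='0123456789')

# Each result is (corner points of the detected box, text, confidence).
for bbox, text, conf in results:
    print('text: {}  confidence: {:.2f}  box: {}'.format(text, conf, bbox))
```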
# Answer 3

**Score**: 0

You can update your code as follows; it should work for you:
```python
import torch
import numpy as np
from PIL import Image, ImageDraw

# Download and load the YOLOv5 SVHN model
model = torch.hub.load('icns-distributed-cloud/yolov5-svhn', 'svhn').fuse().eval()
model = model.autoshape()

# Preprocess the input image
def preprocess_image(image_path):
    image = Image.open(image_path)
    image = image.resize((640, 640))  # Resize to the model's input size
    image = torch.from_numpy(np.array(image)).float() / 255.0  # Normalize pixel values to [0, 1]
    image = image.permute(2, 0, 1).unsqueeze(0)  # Rearrange channels and add batch dimension
    return image

# Perform inference
def recognize_numbers(image_path):
    image = preprocess_image(image_path)
    with torch.no_grad():
        prediction = model(image, size=640)
    return prediction

# Draw a bounding box on a copy of the image
def draw_box(image_path, box, output_path):
    image = Image.open(image_path)
    original_width, original_height = image.size
    print(f"image size: {original_width}x{original_height}")
    draw = ImageDraw.Draw(image)
    x1, y1, x2, y2 = box
    # Map the box from the 640x640 model input back to the original image size
    x1 = int(x1 * original_width / 640)
    y1 = int(y1 * original_height / 640)
    x2 = int(x2 * original_width / 640)
    y2 = int(y2 * original_height / 640)
    print(f"new drawable box ({x1},{y1},{x2},{y2})")
    draw.rectangle((x1, y1, x2, y2), outline='green', width=3)
    image.save(output_path)

# Provide the path to your image
image_path = '/tmp/IDbCb.jpg'

# Recognize the numbers in the image
prediction = recognize_numbers(image_path)

# Find the box with the highest confidence
highest_confidence = 0
best_box = None
best_class = 0
for pred in prediction[0][0]:
    x1, y1, x2, y2, conf, cls = pred[:6].tolist()
    if conf > highest_confidence:
        highest_confidence = conf
        best_box = (int(x1), int(y1), int(x2), int(y2))
        best_class = cls

if best_box:
    x1, y1, x2, y2 = best_box
    print('Box: ({}, {}), ({}, {})'.format(x1, y1, x2, y2))
    print('Class: {}'.format(int(best_class)))
    print('Highest Confidence: {}'.format(highest_confidence))
    output_path = '/tmp/image-1.png'  # Change the output path and file name as needed
    draw_box(image_path, best_box, output_path)
else:
    print('No box found with confidence above the threshold.')
```
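Note that `preprocess_image` feeds the model a fixed 640x640 tensor, so `draw_box` has to rescale the predicted coordinates back to the original image size before drawing. Also, this script keeps only the single highest-confidence detection; if you want all four digits from the question's image, keep every detection above a confidence threshold instead of just the best one.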