2023年7月10日 19:26:06go评论96阅读模式

英文:

YOLO model isn't giving expected result

问题

I used a trained model found in github

model : SVHN

As given in readme.md,

my main.py:

import torch
import PIL.Image as Image
import PIL.ImageEnhance as ImageEnhance
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as img
import numpy as np
from math import *
# Load the SVHN-trained YOLOv5 model through torch.hub, fuse its layers and
# put it in eval mode.
model = torch.hub.load('icns-distributed-cloud/yolov5-svhn', 'svhn').fuse().eval()
# Wrap with autoshape() so the model can be called directly on PIL images
# (predict() below passes it a PIL crop).
model = model.autoshape()
img1 = Image.open(r"C:\Users\user1\Documents\third.jpg")
# Prediction setup.
# NOTE(review): PIL's Image.size is (width, height), so `h` holds the width and
# `w` the height. The names are used in that same order by resize() and crop()
# below, so the script is self-consistent -- confirm before renaming.
h, w = img1.size
# Collects the raw model output of every window scanned by the loops below.
pred = []
# Colour factor 0.0 removes all colour (the grayscale step described in the question).
filter1 = ImageEnhance.Color(img1)
img1 = filter1.enhance(0.0)
# Upscale by 10x along both axes.
img1 = img1.resize((h*10, w*10))
img1.show()
def predict(i, j, h, w, max_depth=None):
    """Run the detector on one window of ``img1`` and plot a confident hit.

    The window is ``img1.crop((i, j, i + h, j + w))``. When the first
    detection row scores at or below 0.4, the search is retried on four
    shifted / enlarged windows. When it scores at or above 0.4, the crop is
    displayed with the detection box drawn on it.

    Args:
        i, j: Offsets of the window inside ``img1`` (first two crop values).
        h, w: Window extents added to ``i`` and ``j`` respectively.
        max_depth: Optional cap on how many nested retries may follow this
            call. ``None`` (the default) keeps the original, unbounded
            behaviour; pass a small integer to guarantee termination.

    Returns:
        The raw model output for this window. Results of the retries are
        only plotted, never returned.
    """
    cropped_img = img1.crop((i, j, i + h, j + w))
    f = model(cropped_img)

    detections = f[0]
    # The script already expects ``None`` when nothing was detected; an empty
    # result is treated the same way instead of raising IndexError below.
    if detections is None or len(detections) == 0:
        return f

    # Rows are unpacked elsewhere in this script as
    # (x1, y1, x2, y2, conf, cls); only the first row is inspected here.
    top = detections[0]
    conf = top[4]

    if conf <= 0.4 and (max_depth is None or max_depth > 0):
        remaining = None if max_depth is None else max_depth - 1
        predict(i, j - w, h, w, remaining)   # window shifted back along j
        predict(i, j, h, w + w, remaining)   # window doubled along j
        predict(i - h, j, h, w, remaining)   # window shifted back along i
        predict(i, j, h * 2, w, remaining)   # window doubled along i

    if conf >= 0.4:
        img2 = np.array(cropped_img)
        cv2.rectangle(img2,
                      (round(top[0].item()), round(top[1].item())),
                      (round(top[2].item()), round(top[3].item())),
                      (255, 100, 0), 6)
        plt.imshow(img2)
        plt.title('class : ' + str(top[5].item()) + " \n confidence : " + str(conf.item()))
        plt.show()
    return f
# Slide an (h, w) window over the 10x-upscaled image and keep every model output.
for i in range(0, h * 10, h):
    for j in range(0, w * 10, w):
        f = predict(i, j, h, w)
        if f is not None:
            pred.append(f)

# Print every detection whose confidence exceeds 0.3.
for result in pred:
    detections = result[0]
    if detections is None:
        continue
    for x1, y1, x2, y2, conf, clas in detections:
        if conf > 0.3:
            print('box: ({}, {}), ({}, {})'.format(x1, y1, x2, y2))
            print('confidence : {}'.format(conf))
            print('class: {}'.format(int(clas)))
            # Disabled: draw the box scaled back down by the 10x upscale factor.
            # print((int(x1/10), int(y1/10)), (int(x2/10), int(y2/10)))
            # cv2.rectangle(img2, (int(x1/10), int(y1/10)), (int(x2/10), int(y2/10)), (255, 0, 0), 3)
            # plt.imshow(img2)
            # plt.show()

The image which I used is :

hence, obviously my expectation is 4,8,4,6.

Initially it detected only the 4, because the image isn't clear. To work around this, I converted the RGB image to grayscale, resized the image, cropped it, and then resized again.

but now, my output is :

Using cache found in C:\Users\Sanmitha/.cache\torch\hub\icns-distributed-cloud_yolov5-svhn_master
from  n    params  module                                  arguments
0                -1  1      3520  models.common.Focus                     [3, 32, 3]
1                -1  1     18560  models.common.Conv                      [32, 64, 3, 2]
2                -1  1     19904  models.common.BottleneckCSP             [64, 64, 1]
3                -1  1     73984  models.common.Conv                      [64, 128, 3, 2]
4                -1  1    161152  models.common.BottleneckCSP             [128, 128, 3]
5                -1  1    295424  models.common.Conv                      [128, 256, 3, 2]
6                -1  1    641792  models.common.BottleneckCSP             [256, 256, 3]
7                -1  1   1180672  models.common.Conv                      [256, 512, 3, 2]
8                -1  1    656896  models.common.SPP                       [512, 512, [5, 9, 13]]        
9                -1  1   1248768  models.common.BottleneckCSP             [512, 512, 1, False]
10                -1  1    131584  models.common.Conv                      [512, 256, 1, 1]
11                -1  1         0  torch.nn.modules.upsampling.Upsample    [None, 2, 'nearest']
12           [-1, 6]  1         0  models.common.Concat                    [1]
13                -1  1    378624  models.common.BottleneckCSP             [512, 256, 1, False]
14                -1  1     33024  models.common.Conv                      [256, 128, 1, 1]
15                -1  1         0  torch.nn.modules.upsampling.Upsample    [None, 2, 'nearest']
16           [-1, 4]  1         0  models.common.Concat                    [1]
17                -1  1     95104  models.common.BottleneckCSP             [256, 128, 1, False]
18                -1  1    147712  models.common.Conv                      [128, 128, 3, 2]
19          [-1, 14]  1         0  models.common.Concat                    [1]
20                -1  1    313088  models.common.BottleneckCSP             [256, 256, 1, False]
21                -1  1    590336  models.common.Conv                      [256, 256, 3, 2]
22          [-1, 10]  1         0  models.common.Concat                    [1]
23                -1  1   1248768  models.common.BottleneckCSP             [512, 512, 1, False]
24      [17
<details>
<summary>英文:</summary>
I used a trained model found in github
model : [SVHN](https://github.com/icns-distributed-cloud/YOLOv5-SVHN)
As given in readme.md,
my main.py:

import torch
import PIL.Image as Image
import PIL.ImageEnhance as ImageEnhance
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as img
import numpy as np
from math import *

# Load the SVHN-trained YOLOv5 model through torch.hub, fuse its layers and
# put it in eval mode.
model = torch.hub.load('icns-distributed-cloud/yolov5-svhn', 'svhn').fuse().eval()
# Wrap with autoshape() so the model can be called directly on PIL images.
model = model.autoshape()
img1 = Image.open(r"C:\Users\user1\Documents\third.jpg")

# Prediction
# NOTE(review): PIL's Image.size is (width, height), so `h` holds the width and
# `w` the height. The names are used in that same order by resize() and crop(),
# so the script is self-consistent -- confirm before renaming.
h, w = img1.size
# Collects the raw model output of every window scanned by the loops below.
pred = []
# Colour factor 0.0 removes all colour (the grayscale step described in the question).
filter1 = ImageEnhance.Color(img1)
img1 = filter1.enhance(0.0)
# Upscale by 10x along both axes (the `*` operators were lost in the paste).
img1 = img1.resize((h * 10, w * 10))
img1.show()

def predict(i, j, h, w, max_depth=None):
    """Run the detector on one window of ``img1`` and plot a confident hit.

    The window is ``img1.crop((i, j, i + h, j + w))``. When the first
    detection row scores at or below 0.4, the search is retried on four
    shifted / enlarged windows. When it scores at or above 0.4, the crop is
    displayed with the detection box drawn on it.

    Args:
        i, j: Offsets of the window inside ``img1`` (first two crop values).
        h, w: Window extents added to ``i`` and ``j`` respectively.
        max_depth: Optional cap on how many nested retries may follow this
            call. ``None`` (the default) keeps the original, unbounded
            behaviour; pass a small integer to guarantee termination.

    Returns:
        The raw model output for this window. Results of the retries are
        only plotted, never returned.
    """
    cropped_img = img1.crop((i, j, i + h, j + w))
    f = model(cropped_img)

    detections = f[0]
    # The script already expects ``None`` when nothing was detected; an empty
    # result is treated the same way instead of raising IndexError below.
    if detections is None or len(detections) == 0:
        return f

    # Rows are unpacked elsewhere in this script as
    # (x1, y1, x2, y2, conf, cls); only the first row is inspected here.
    top = detections[0]
    conf = top[4]

    if conf <= 0.4 and (max_depth is None or max_depth > 0):
        remaining = None if max_depth is None else max_depth - 1
        predict(i, j - w, h, w, remaining)   # window shifted back along j
        predict(i, j, h, w + w, remaining)   # window doubled along j
        predict(i - h, j, h, w, remaining)   # window shifted back along i
        predict(i, j, h * 2, w, remaining)   # window doubled along i

    if conf >= 0.4:
        img2 = np.array(cropped_img)
        cv2.rectangle(img2,
                      (round(top[0].item()), round(top[1].item())),
                      (round(top[2].item()), round(top[3].item())),
                      (255, 100, 0), 6)
        plt.imshow(img2)
        plt.title('class : ' + str(top[5].item()) + " \n confidence : " + str(conf.item()))
        plt.show()
    return f

# Slide an (h, w) window over the 10x-upscaled image and keep every model output.
# (`h10` / `w10` in the pasted text are `h*10` / `w*10` with the `*` lost.)
for i in range(0, h * 10, h):
    for j in range(0, w * 10, w):
        f = predict(i, j, h, w)
        if f is not None:
            pred.append(f)

# Print every detection whose confidence exceeds 0.3.
for result in pred:
    detections = result[0]
    if detections is None:
        continue
    for x1, y1, x2, y2, conf, clas in detections:
        if conf > 0.3:
            print('box: ({}, {}), ({}, {})'.format(x1, y1, x2, y2))
            print('confidence : {}'.format(conf))
            print('class: {}'.format(int(clas)))
            # Disabled: draw the box scaled back down by the 10x upscale factor.
            # print((int(x1/10),int(y1/10)),(int(x2/10),int(y2/10)))
            # cv2.rectangle(img2,(int(x1/10),int(y1/10)),(int(x2/10),int(y2/10)),(255,0,0),3)
            # plt.imshow(img2)
            # plt.show()


The image which I used is :
[![third.jpg](https://i.stack.imgur.com/IDbCb.jpg)](https://i.stack.imgur.com/IDbCb.jpg)
hence, obviously my expectation is 4,8,4,6.
Initially it detected only the 4, because the image isn't clear. To work around this, I converted the RGB image to grayscale, resized the image, cropped it, and then resized again.
but now, my output is :

Using cache found in C:\Users\Sanmitha/.cache\torch\hub\icns-distributed-cloud_yolov5-svhn_master

                 from  n    params  module                                  arguments
  0                -1  1      3520  models.common.Focus                     [3, 32, 3]
  1                -1  1     18560  models.common.Conv                      [32, 64, 3, 2]
  2                -1  1     19904  models.common.BottleneckCSP             [64, 64, 1]
  3                -1  1     73984  models.common.Conv                      [64, 128, 3, 2]
  4                -1  1    161152  models.common.BottleneckCSP             [128, 128, 3]
  5                -1  1    295424  models.common.Conv                      [128, 256, 3, 2]
  6                -1  1    641792  models.common.BottleneckCSP             [256, 256, 3]
  7                -1  1   1180672  models.common.Conv                      [256, 512, 3, 2]
  8                -1  1    656896  models.common.SPP                       [512, 512, [5, 9, 13]]
  9                -1  1   1248768  models.common.BottleneckCSP             [512, 512, 1, False]
 10                -1  1    131584  models.common.Conv                      [512, 256, 1, 1]
 11                -1  1         0  torch.nn.modules.upsampling.Upsample    [None, 2, 'nearest']
 12           [-1, 6]  1         0  models.common.Concat                    [1]
 13                -1  1    378624  models.common.BottleneckCSP             [512, 256, 1, False]
 14                -1  1     33024  models.common.Conv                      [256, 128, 1, 1]
 15                -1  1         0  torch.nn.modules.upsampling.Upsample    [None, 2, 'nearest']
 16           [-1, 4]  1         0  models.common.Concat                    [1]
 17                -1  1     95104  models.common.BottleneckCSP             [256, 128, 1, False]
 18                -1  1    147712  models.common.Conv                      [128, 128, 3, 2]
 19          [-1, 14]  1         0  models.common.Concat                    [1]
 20                -1  1    313088  models.common.BottleneckCSP             [256, 256, 1, False]
 21                -1  1    590336  models.common.Conv                      [256, 256, 3, 2]
 22          [-1, 10]  1         0  models.common.Concat                    [1]
 23                -1  1   1248768  models.common.BottleneckCSP             [512, 512, 1, False]
 24      [17, 20, 23]  1     43152  models.yolo.Detect                      [11, [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], [128, 256, 512]]
Model Summary: 191 layers, 7.28206e+06 parameters, 7.28206e+06 gradients

Fusing layers...
Model Summary: 140 layers, 7.27349e+06 parameters, 7.27349e+06 gradients
Adding autoShape...
C:\Users\user1\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\functional.py:504: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ..\aten\src\ATen\native\TensorShape.cpp:3484.)
return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]
box: (0.0, 0.19499744474887848), (137.242431640625, 128.22288513183594)
confidence : 0.30603668093681335
class: 1
box: (52.23253631591797, 0.0), (190.0, 70.11492156982422)
confidence : 0.41145217418670654
class: 2
box: (54.806610107421875, 3.3701179027557373), (190.0, 278.511474609375)
confidence : 0.4895976781845093
class: 7
box: (55.85855484008789, 0.3548659086227417), (190.0, 197.8501739501953)
confidence : 0.365851491689682
class: 8
box: (0.0, 0.0), (140.3123779296875, 216.81472778320312)
confidence : 0.3351562023162842
class: 4
box: (53.156036376953125, 0.0), (190.0, 72.2713851928711)
confidence : 0.40216246247291565
class: 7
box: (0.0, 0.0), (130.40765380859375, 213.34080505371094)
confidence : 0.3830447793006897
class: 6
box: (51.61797332763672, 0.0), (190.0, 70.91419982910156)
confidence : 0.31078973412513733
class: 7


The detected images are
[![figure1](https://i.stack.imgur.com/qr5AQ.png)](https://i.stack.imgur.com/qr5AQ.png)
[![figure2](https://i.stack.imgur.com/pJlpd.png)](https://i.stack.imgur.com/pJlpd.png)
[![figure3](https://i.stack.imgur.com/jGiY0.png)](https://i.stack.imgur.com/jGiY0.png)
[![figure4](https://i.stack.imgur.com/oEOhy.png)](https://i.stack.imgur.com/oEOhy.png)
[![figure5](https://i.stack.imgur.com/asRk8.png)](https://i.stack.imgur.com/asRk8.png)
[![figure6](https://i.stack.imgur.com/nmi6H.png)](https://i.stack.imgur.com/nmi6H.png)
[![figure7](https://i.stack.imgur.com/rhrbO.png)](https://i.stack.imgur.com/rhrbO.png)
[![figure8](https://i.stack.imgur.com/gBEjo.png)](https://i.stack.imgur.com/gBEjo.png)
[![figure9](https://i.stack.imgur.com/cAlWp.png)](https://i.stack.imgur.com/cAlWp.png)
[![figure10](https://i.stack.imgur.com/iDhES.png)](https://i.stack.imgur.com/iDhES.png)
[![figure11](https://i.stack.imgur.com/Axuqa.png)](https://i.stack.imgur.com/Axuqa.png)
[![figure12](https://i.stack.imgur.com/dz5IN.png)](https://i.stack.imgur.com/dz5IN.png)
[![figure13](https://i.stack.imgur.com/QDEad.png)](https://i.stack.imgur.com/QDEad.png)
I don't know why this detects other things as digits with good confidence but fails to detect the actual digits with good confidence...
Kindly help me
</details>
# 答案1
**得分**: 2
你可以尝试文本识别，类似于无需训练的OCR。只需运行您的图像并获取输出，然后可能添加一个简单的模糊匹配过滤器。
<details>
<summary>英文:</summary>
You can try text recognition instead, i.e. an OCR engine that needs no training.
Just run it over your images, collect the output, and optionally add a simple fuzzy-match filter.
[![enter image description here][1]][1]
[1]: https://i.stack.imgur.com/p9FXZ.jpg
</details>
# 答案2
**得分**: 0
首先使用CRAFT：字符区域感知文本检测来识别图像中的文本。然后将裁剪的文本图像传递给OCR，比如Tesseract，以从图像中提取文本。
<details>
<summary>英文:</summary>
First use CRAFT: Character-Region Awareness For Text detection to identify the texts in your image. Then pass the cropped text image to OCR like Tesseract to extract the texts from image.
[![enter image description here][1]][1]
[![enter image description here][2]][2]
[![enter image description here][3]][3]
[1]: https://i.stack.imgur.com/FLGtM.jpg
[2]: https://i.stack.imgur.com/XAUz7.jpg
[3]: https://i.stack.imgur.com/QN7wk.jpg
</details>
# 答案3
**得分**: 0
你可以更新这段代码，它应该适用于你：
```python
import torch
import numpy as np
from PIL import Image, ImageDraw
# 下载并加载YOLOv5 SVHN模型
# Load the SVHN-trained YOLOv5 model through torch.hub, fuse its layers and
# put it in eval mode.
model = torch.hub.load('icns-distributed-cloud/yolov5-svhn', 'svhn').fuse().eval()
# Wrap the model with the repository's autoshape() helper.
model = model.autoshape()
# 预处理输入图像
def preprocess_image(image_path):
    """Load an image file as a normalised ``(1, 3, 640, 640)`` float tensor.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A float tensor with values in ``[0, 1]`` in batch-first,
        channel-first layout.
    """
    # convert("RGB") guarantees exactly three channels; a grayscale or RGBA
    # file would otherwise break the permute() below or feed the model the
    # wrong channel count. The context manager closes the file handle.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    image = image.resize((640, 640))  # resize to the model's input size
    image = torch.from_numpy(np.array(image)).float() / 255.0  # scale pixels to [0, 1]
    image = image.permute(2, 0, 1).unsqueeze(0)  # HWC -> CHW, then add the batch dimension
    return image
# 执行推理
def recognize_numbers(image_path):
    """Run the model on the image at ``image_path`` and return its raw output.

    Args:
        image_path: Path of the image file; it is loaded via ``preprocess_image``.

    Returns:
        Whatever ``model(image, size=640)`` returns.
    """
    image = preprocess_image(image_path)
    # Inference only: no_grad() avoids building the autograd graph.
    with torch.no_grad():
        prediction = model(image, size=640)
    return prediction
# 在图像副本上绘制边界框
def draw_box(image_path, box, output_path, model_size=640):
    """Draw ``box`` on the original image and save the result.

    Args:
        image_path: Path of the original (un-resized) image.
        box: ``(x1, y1, x2, y2)`` in the coordinates of the
            ``model_size`` x ``model_size`` resized image.
        output_path: Where the annotated image is written.
        model_size: Side length the image was resized to before inference.
            Defaults to 640, matching ``preprocess_image``.

    Raises:
        TypeError: If ``box`` is ``None`` (it cannot be unpacked).
    """
    # The context manager releases the file handle once the copy is saved.
    with Image.open(image_path) as image:
        original_width, original_height = image.size
        print(f"图像大小: {original_width}x{original_height}")
        draw = ImageDraw.Draw(image)
        x1, y1, x2, y2 = box
        # Map coordinates from the resized square back to the original size.
        x1 = int(x1 * original_width / model_size)
        y1 = int(y1 * original_height / model_size)
        x2 = int(x2 * original_width / model_size)
        y2 = int(y2 * original_height / model_size)
        print(f"新的可绘制边界框 ({x1},{y1},{x2},{y2})")
        draw.rectangle((x1, y1, x2, y2), outline='green', width=3)
        image.save(output_path)
# 提供图像路径
image_path = '/tmp/IDbCb.jpg'
output_path = '/tmp/image-1.png'  # change the output path and file name as needed

# Run the model on the image.
prediction = recognize_numbers(image_path)

# Keep only the candidate with the highest confidence.
# NOTE(review): assumes each row of prediction[0][0] starts with
# (x1, y1, x2, y2, conf, cls) -- confirm against the model's output layout
# for tensor inputs.
highest_confidence = 0
best_box = None
best_class = 0
for pred in prediction[0][0]:
    x1, y1, x2, y2, conf, cls = pred[:6].tolist()
    if conf > highest_confidence:
        highest_confidence = conf
        best_box = (int(x1), int(y1), int(x2), int(y2))
        best_class = cls

if best_box is not None:
    x1, y1, x2, y2 = best_box
    print('边界框: ({}, {}), ({}, {})'.format(x1, y1, x2, y2))
    print('类别: {}'.format(int(best_class)))
    print('最高置信度: {}'.format(highest_confidence))
    # Draw only when a box exists; draw_box() cannot unpack None.
    draw_box(image_path, best_box, output_path)
else:
    print('未找到置信度超过阈值的边界框。')

英文:

you can update this code, it should work for you

import torch
import numpy as np
from PIL import Image, ImageDraw
# Download and load the YOLOv5 SVHN model
model = torch.hub.load('icns-distributed-cloud/yolov5-svhn', 'svhn').fuse().eval()
# Wrap the model with the repository's autoshape() helper.
model = model.autoshape()
# Preprocess the input image
def preprocess_image(image_path):
    """Load an image file as a normalised ``(1, 3, 640, 640)`` float tensor.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A float tensor with values in ``[0, 1]`` in batch-first,
        channel-first layout.
    """
    # convert("RGB") guarantees exactly three channels; a grayscale or RGBA
    # file would otherwise break the permute() below or feed the model the
    # wrong channel count. The context manager closes the file handle.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    image = image.resize((640, 640))  # Resize to the model's input size
    image = torch.from_numpy(np.array(image)).float() / 255.0  # Scale pixels to [0, 1]
    image = image.permute(2, 0, 1).unsqueeze(0)  # HWC -> CHW, then add the batch dimension
    return image
# Perform inference
def recognize_numbers(image_path):
    """Run the model on the image at ``image_path`` and return its raw output.

    Args:
        image_path: Path of the image file; it is loaded via ``preprocess_image``.

    Returns:
        Whatever ``model(image, size=640)`` returns.
    """
    image = preprocess_image(image_path)
    # Inference only: no_grad() avoids building the autograd graph.
    with torch.no_grad():
        prediction = model(image, size=640)
    return prediction
# Draw bounding box on image copy
def draw_box(image_path, box, output_path, model_size=640):
    """Draw ``box`` on the original image and save the result.

    Args:
        image_path: Path of the original (un-resized) image.
        box: ``(x1, y1, x2, y2)`` in the coordinates of the
            ``model_size`` x ``model_size`` resized image.
        output_path: Where the annotated image is written.
        model_size: Side length the image was resized to before inference.
            Defaults to 640, matching ``preprocess_image``.

    Raises:
        TypeError: If ``box`` is ``None`` (it cannot be unpacked).
    """
    # The context manager releases the file handle once the copy is saved.
    with Image.open(image_path) as image:
        original_width, original_height = image.size
        print(f"image size: {original_width}x{original_height}")
        draw = ImageDraw.Draw(image)
        x1, y1, x2, y2 = box
        # Map coordinates from the resized square back to the original size.
        x1 = int(x1 * original_width / model_size)
        y1 = int(y1 * original_height / model_size)
        x2 = int(x2 * original_width / model_size)
        y2 = int(y2 * original_height / model_size)
        print(f"new drawable box ({x1},{y1},{x2},{y2})")
        draw.rectangle((x1, y1, x2, y2), outline='green', width=3)
        image.save(output_path)
# Provide the path to your image
image_path = '/tmp/IDbCb.jpg'
output_path = '/tmp/image-1.png'  # Change the output path and file name as per your requirement

# Call the function to recognize the numbers in the image
prediction = recognize_numbers(image_path)

# Find the box with the highest confidence
# NOTE(review): assumes each row of prediction[0][0] starts with
# (x1, y1, x2, y2, conf, cls) -- confirm against the model's output layout
# for tensor inputs.
highest_confidence = 0
best_box = None
best_class = 0
for pred in prediction[0][0]:
    x1, y1, x2, y2, conf, cls = pred[:6].tolist()
    if conf > highest_confidence:
        highest_confidence = conf
        best_box = (int(x1), int(y1), int(x2), int(y2))
        best_class = cls

if best_box is not None:
    x1, y1, x2, y2 = best_box
    print('Box: ({}, {}), ({}, {})'.format(x1, y1, x2, y2))
    print('Class: {}'.format(int(best_class)))
    print('Highest Confidence: {}'.format(highest_confidence))
    # Draw only when a box exists; draw_box() cannot unpack None.
    draw_box(image_path, best_box, output_path)
else:
    print('No box found with confidence above the threshold.')

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

YOLO模型未产生预期结果。

问题

Prediction

跳出内部if语句到外部else。

更新JSON文件中的字典

在Python Pandas中检测Excel列的数据类型

Pytest单元测试出现无法实例化抽象类错误。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。